This is an R Markdown document used to represent exploratory data analysis on the loan data provided by https://www.lendingclub.com/info/download-data.action
library(readr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
#Reading the Loan Data csv file as data.frame
LoanStats <- read_csv("C:/Users/Bhuvan/Desktop/Variable_Selection_Project/LoanStats.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## loan_amnt = col_integer(),
## funded_amnt = col_integer(),
## funded_amnt_inv = col_double(),
## installment = col_double(),
## annual_inc = col_double(),
## dti = col_double(),
## delinq_2yrs = col_integer(),
## inq_last_6mths = col_integer(),
## mths_since_last_delinq = col_integer(),
## mths_since_last_record = col_integer(),
## open_acc = col_integer(),
## pub_rec = col_integer(),
## revol_bal = col_integer(),
## total_acc = col_integer(),
## out_prncp = col_integer(),
## out_prncp_inv = col_integer(),
## total_pymnt = col_double(),
## total_pymnt_inv = col_double(),
## total_rec_prncp = col_double(),
## total_rec_int = col_double()
## # ... with 11 more columns
## )
## See spec(...) for full column specifications.
nrow(LoanStats)
## [1] 42535
ncol(LoanStats)
## [1] 122
#Removing the columns with no data
LoanStats1 <- LoanStats[,!apply(LoanStats, 2, function(x)
all(gsub(" ", "", x)=="", na.rm=TRUE))]
nrow(LoanStats1)
## [1] 42535
ncol(LoanStats1)
## [1] 54
#Composition of all the portfolio
library(sqldf)
LoanStats_agg <- sqldf('select purpose, count(*) as counts,
sum(loan_amnt) as total_exposure from
LoanStats1 group by purpose')
## Loading required package: tcltk
## Warning: Quoted identifiers should have class SQL, use DBI::SQL() if the
## caller performs the quoting.
LoanStats_agg$tot_loan_cnt <- as.numeric(sum(LoanStats_agg$counts))
LoanStats_agg$pct_cnt <- as.numeric(round(LoanStats_agg$counts/LoanStats_agg$tot_loan_cnt*100,2))
LoanStats_agg$tot_loan_amt <- as.numeric(sum(LoanStats_agg$total_exposure))
LoanStats_agg$pct_exposure <- as.numeric(round(LoanStats_agg$total_exposure/LoanStats_agg$tot_loan_amt*100,2))
lbls <- LoanStats_agg$purpose
lbls1 <- paste(lbls, LoanStats_agg$pct_exposure) # add percents to labels
lbls1 <- paste(lbls1,"%",sep="") # ad % to labels
lbls2 <- paste(lbls, LoanStats_agg$pct_cnt) # add percents to labels
lbls2 <- paste(lbls2,"%",sep="") # ad % to labels
pie(LoanStats_agg$total_exposure,labels = lbls1, col=rainbow(length(lbls)),
radius = 1, main="Portfolio wise Total Exposure ",cex=0.75)
pie(LoanStats_agg$counts,labels = lbls2, col=rainbow(length(lbls)),
radius = 1, main="Portfolio wise Number of loans",cex=0.75)
So as the debt consolidation represented the major portion of the loan data, we have done analysis for only debt consolidation portfolio.
ind_dc <- which(LoanStats1$purpose == 'debt_consolidation')
LoanStats2 <- LoanStats1[ind_dc,]
sqldf('select loan_status, count(*) as num_loans from LoanStats2
group by loan_status')
## loan_status num_loans
## 1 Charged Off 2792
## 2 Does not meet the credit policy. Status:Charged Off 292
## 3 Does not meet the credit policy. Status:Fully Paid 808
## 4 Fully Paid 15884
#cleaning the loan status variable
LoanStats2$loan_status_new <- gsub('Does not meet the credit policy. Status:'
,'',LoanStats2$loan_status)
#converting the interest and revolving utilization rate to number
LoanStats2$int_rate_num <- as.numeric(gsub('%','',LoanStats2$int_rate))
LoanStats2$revol_util_num <- as.numeric(gsub('%','',LoanStats2$revol_util))
#Creating total loan count and total exposure variable
LoanStats2$tot_loan_amt <- as.numeric(sum(LoanStats2$loan_amnt))
LoanStats2$tot_loan_cnt <- as.numeric(nrow(LoanStats2))
Initially we had 54 variables and based on our analysis we have rejected n variables.
Below are the some of the rejected variables with our reason.
i> Variables with same unique value throughout. eg policy_code, application_type, pymnt_plan
ii> Variables having free text values & needs extensive text mining. eg desc, title, emp_title
iii> Variables having major portion as blank or unique with minor stake in exposure. eg mths_since_last_record, pub_rec, chargeoff_within_12_mths
LoanStats2$tot_loan_amt <- as.numeric(sum(LoanStats2$loan_amnt))
LoanStats2$tot_loan_cnt <- as.numeric(nrow(LoanStats2))
library(sqldf)
pub_rec_agg <- sqldf('select pub_rec,
loan_status_new,count(*) as cnts,
sum(loan_amnt) as exposure,
round(sum(loan_amnt)/tot_loan_amt*100,2) as exp_pct
from LoanStats2 group by pub_rec,
loan_status_new')
pub_rec_agg
## pub_rec loan_status_new cnts exposure exp_pct
## 1 NA Fully Paid 1 1900 0.00
## 2 0 Charged Off 2787 37229325 14.99
## 3 0 Fully Paid 15841 198782825 80.03
## 4 1 Charged Off 294 3481150 1.40
## 5 1 Fully Paid 827 8621100 3.47
## 6 2 Charged Off 3 31950 0.01
## 7 2 Fully Paid 19 189000 0.08
## 8 3 Fully Paid 3 32400 0.01
## 9 4 Fully Paid 1 7200 0.00
LoanStats2$tot_loan_amt <- as.numeric(sum(LoanStats2$loan_amnt))
LoanStats2$tot_loan_cnt <- as.numeric(nrow(LoanStats2))
library(sqldf)
chargeoff_within_12_mths_agg <- sqldf('select chargeoff_within_12_mths,
loan_status_new,count(*) as cnts,
sum(loan_amnt) as exposure,
round(sum(loan_amnt)/tot_loan_amt*100,2) as exp_pct
from LoanStats2 group by chargeoff_within_12_mths,
loan_status_new')
chargeoff_within_12_mths_agg
## chargeoff_within_12_mths loan_status_new cnts exposure exp_pct
## 1 NA Charged Off 6 74725 0.03
## 2 NA Fully Paid 28 145850 0.06
## 3 0 Charged Off 3078 40667700 16.37
## 4 0 Fully Paid 16664 207488575 83.54
summary(LoanStats2$loan_amnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 700 7000 11000 12560 16500 35000
plot(LoanStats2$loan_amnt, ylab = "Loan Amount")
hist(LoanStats2$loan_amnt, breaks = 20,
xlab = "Loan Amount", col="#CCCCFF",
main="Histogram of Loan Amount Distribution")
a <- quantile(LoanStats2$loan_amnt,.90)
b <- quantile(LoanStats2$loan_amnt,0)
ind_la <- which(LoanStats2$loan_amnt < a & LoanStats2$loan_amnt > b)
LoanStats_la <- LoanStats2[ind_la,]
hist(LoanStats_la$loan_amnt, breaks = 10,
xlab = "Loan Amount", col="#CCCCFF",
main="Histogram of Loan Amount Distribution")
summary(LoanStats2$loan_amnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 700 7000 11000 12560 16500 35000
summary(LoanStats_la$loan_amnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 800 6200 10000 10707 15000 23975
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_la)/no_bins
unique(cut2(LoanStats_la$loan_amnt, m=no_cases_bin))
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]$loan_amnt))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]))
LoanStats_la$loan_amnt_bin <- cut2(LoanStats_la$loan_amnt, m=no_cases_bin)
loan_amnt_bin <- sqldf('select loan_amnt_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_la group by loan_amnt_bin, loan_status_new')
loan_amnt_bin_tran <- reshape(loan_amnt_bin, idvar = "loan_amnt_bin",
timevar = "loan_status_new",
direction = "wide")
loan_amnt_bin_tran <-data.frame(loan_amnt_bin=loan_amnt_bin_tran$loan_amnt_bin,
charge_off_Cnts=loan_amnt_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=loan_amnt_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=loan_amnt_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=loan_amnt_bin_tran$`exposure.Fully Paid`,
annual_inc_total=loan_amnt_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=loan_amnt_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=loan_amnt_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(loan_amnt_bin_tran$`cnts.Charged Off`/(loan_amnt_bin_tran$`cnts.Charged Off`+
loan_amnt_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(loan_amnt_bin_tran$`exposure.Charged Off`/(loan_amnt_bin_tran$`exposure.Charged Off`+
loan_amnt_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(loan_amnt_bin_tran$`cnts.Charged Off`/loan_amnt_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(loan_amnt_bin_tran$`exposure.Charged Off`/loan_amnt_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- loan_amnt_bin_tran[,c("loan_amnt_bin","pct_cnts_grp")]
df2 <- loan_amnt_bin_tran[,c("loan_amnt_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=loan_amnt_bin_tran,
aes(x=loan_amnt_bin_tran$loan_amnt_bin,
y=loan_amnt_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Loan Amount bin", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Loan Amount bin", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$loan_amnt_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$loan_amnt_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
## [[1]]
##
## [[2]]
bin_plot(9)
## [[1]]
##
## [[2]]
Inference
->Initially in 10 bins we can observe that the charge off rate remains almost same wrt to higher Loan amount bins. -> But as we converge the number of bins, we can observe a pattern, first charge off rate decreases and then after certain point it starts increasingto the maximum. -> We can say that the loans in the lowest and highest loan amount buckets tends to charge off more, that too more in the highest bucket.
#The total amount committed to that loan at that point in time.
summary(LoanStats2$funded_amnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 700 6825 10800 12257 16000 35000
#summary(LoanStats2$loan_amnt)
plot(LoanStats2$funded_amnt, ylab = "Total amount committed")
hist(LoanStats2$funded_amnt, breaks = 20,
xlab = "The total amount committed", col="#CCCCFF",
main="Histogram of Total amount committed")
a <- quantile(LoanStats2$funded_amnt,.90)
b <- quantile(LoanStats2$funded_amnt,0)
ind_la <- which(LoanStats2$funded_amnt < a & LoanStats2$funded_amnt > b)
LoanStats_la <- LoanStats2[ind_la,]
hist(LoanStats_la$funded_amnt, breaks = 10,
xlab = "Total amount committed", col="#CCCCFF",
main="Histogram of Total amount committed")
summary(LoanStats2$funded_amnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 700 6825 10800 12257 16000 35000
summary(LoanStats_la$funded_amnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 800 6025 10000 10565 14800 22725
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_la)/no_bins
unique(cut2(LoanStats_la$funded_amnt, m=no_cases_bin))
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]$funded_amnt))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]))
LoanStats_la$funded_amnt_bin <- cut2(LoanStats_la$funded_amnt, m=no_cases_bin)
funded_amnt_bin <- sqldf('select funded_amnt_bin, loan_status_new, count(*) as
cnts, sum(funded_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(funded_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_la group by funded_amnt_bin, loan_status_new')
funded_amnt_bin_tran <- reshape(funded_amnt_bin, idvar = "funded_amnt_bin",
timevar = "loan_status_new",
direction = "wide")
funded_amnt_bin_tran <-data.frame(funded_amnt_bin=funded_amnt_bin_tran$funded_amnt_bin,
charge_off_Cnts=funded_amnt_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=funded_amnt_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=funded_amnt_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=funded_amnt_bin_tran$`exposure.Fully Paid`,
annual_inc_total=funded_amnt_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=funded_amnt_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=funded_amnt_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(funded_amnt_bin_tran$`cnts.Charged Off`/(funded_amnt_bin_tran$`cnts.Charged Off`+
funded_amnt_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(funded_amnt_bin_tran$`exposure.Charged Off`/(funded_amnt_bin_tran$`exposure.Charged Off`+
funded_amnt_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(funded_amnt_bin_tran$`cnts.Charged Off`/funded_amnt_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(funded_amnt_bin_tran$`exposure.Charged Off`/funded_amnt_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- funded_amnt_bin_tran[,c("funded_amnt_bin","pct_cnts_grp")]
df2 <- funded_amnt_bin_tran[,c("funded_amnt_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=funded_amnt_bin_tran,
aes(x=funded_amnt_bin_tran$funded_amnt_bin,
y=funded_amnt_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Funded Amount bin", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Funded Amount bin", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$funded_amnt_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$funded_amnt_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(9)
## [[1]]
##
## [[2]]
Inference -> Almost similar kind of pattern as loan amount.
Inference -> Almost similar kind of pattern as loan amount.
# Interest Rate Distribution and Analysis
summary(LoanStats2$int_rate_num)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.42 10.00 12.61 12.61 15.21 24.11
plot(LoanStats2$int_rate_num, ylab = "Interest Rate")
hist(LoanStats2$int_rate_num, breaks = 20,
xlab = "Interest Rate", col="#CCCCFF",
main="Histogram of Interest Rate Distribution")
library(Hmisc)
# Creating 10 bins for interest rate
no_bins <- 10
no_cases_bin <- nrow(LoanStats2)/no_bins
LoanStats2$tot_loan_amt <- as.numeric(sum(LoanStats2$loan_amnt))
LoanStats2$tot_loan_cnt <- as.numeric(nrow(LoanStats2))
LoanStats2$int_rate_bin <- cut2(LoanStats2$int_rate_num, m=no_cases_bin)
#Aggregating the data interest rate bin wise
intrate_bin <- sqldf('select int_rate_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats2 group by int_rate_bin, loan_status_new')
intrate_bin_tran <- reshape(intrate_bin, idvar = "int_rate_bin",
timevar = "loan_status_new",
direction = "wide")
intrate_bin_tran <-data.frame(int_rate_bin=intrate_bin_tran$int_rate_bin,
charge_off_Cnts=intrate_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=intrate_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=intrate_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=intrate_bin_tran$`exposure.Fully Paid`,
tot_loan_amt=intrate_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=intrate_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(intrate_bin_tran$`cnts.Charged Off`/(intrate_bin_tran$`cnts.Charged Off`+
intrate_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(intrate_bin_tran$`exposure.Charged Off`/(intrate_bin_tran$`exposure.Charged Off`+
intrate_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(intrate_bin_tran$`cnts.Charged Off`/intrate_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(intrate_bin_tran$`exposure.Charged Off`/intrate_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- intrate_bin_tran[,c("int_rate_bin","pct_cnts_grp")]
df2 <- intrate_bin_tran[,c("int_rate_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=intrate_bin_tran,
aes(x=intrate_bin_tran$int_rate_bin,
y=intrate_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Interest Rate Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$int_rate_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$int_rate_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$int_rate_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$int_rate_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
Inference -> Here there is a clear and linear relationship between Interest rates and the charge off rates. -> Rate of charge off increase as we move in higher interest rate bins.
#Installment Distribution and Analysis
summary(LoanStats2$installment)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.22 205.86 327.14 364.14 482.65 1305.19
plot(LoanStats2$installment, ylab = "Installment Amount before")
hist(LoanStats2$installment)
hist(LoanStats2$installment, breaks = 20,
xlab = "Installment Amount", col="#CCCCFF",
main="Histogram of Installment Amount Distribution")
a <- quantile(LoanStats2$installment,.95)
b <- quantile(LoanStats2$installment,0)
ind_1 <- which(LoanStats2$installment < a & LoanStats2$installment > b)
LoanStats_ins <- LoanStats2[ind_1,]
plot(LoanStats_ins$installment, ylab = "Installment Amount after")
hist(LoanStats_ins$installment, breaks = 20,
xlab = "Installment Amount", col="#CCCCFF",
main="Histogram of Installment Amount Distribution")
#Before
summary(LoanStats2$installment)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.22 205.86 327.14 364.14 482.65 1305.19
#After
summary(LoanStats_ins$installment)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.25 199.80 316.24 335.93 451.21 792.72
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_ins)/no_bins
no_cases_bin
unique(cut2(LoanStats_ins$installment, m=no_cases_bin))
LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$installment))
LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))
LoanStats_ins$install_amt_bin <- cut2(LoanStats_ins$installment, m=no_cases_bin)
install_bin <- sqldf('select install_amt_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_ins group by install_amt_bin, loan_status_new')
install_bin_tran <- reshape(install_bin, idvar = "install_amt_bin",
timevar = "loan_status_new",
direction = "wide")
install_bin_tran <-data.frame(install_amt_bin=install_bin_tran$install_amt_bin,
charge_off_Cnts=install_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=install_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=install_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=install_bin_tran$`exposure.Fully Paid`,
tot_loan_amt=install_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=install_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(install_bin_tran$`cnts.Charged Off`/(install_bin_tran$`cnts.Charged Off`+
install_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(install_bin_tran$`exposure.Charged Off`/(install_bin_tran$`exposure.Charged Off`+
install_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(install_bin_tran$`cnts.Charged Off`/install_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(install_bin_tran$`exposure.Charged Off`/install_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- install_bin_tran[,c("install_amt_bin","pct_cnts_grp")]
df2 <- install_bin_tran[,c("install_amt_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=install_bin_tran,
aes(x=install_bin_tran$install_amt_bin,
y=install_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Installment Amount Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$install_amt_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$install_amt_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$install_amt_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$install_amt_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
return(p)
}
bin_plot(12)
bin_plot(10)
Inference -> Here also no clear relationship is there between installment amount and the charge off rate.
#Annual Income Distribution and Analysis
summary(LoanStats2$annual_inc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1896 41000 58000 67329 80000 2039784
plot(LoanStats2$annual_inc, ylab = "Annual Income Amount before")
hist(LoanStats2$annual_inc, breaks = 20,
xlab = "Annual Income", col="#CCCCFF",
main="Histogram of Annual Income Distribution before")
a <- quantile(LoanStats2$annual_inc,.90)
b <- quantile(LoanStats2$annual_inc,0)
ind_1 <- which(LoanStats2$annual_inc < a & LoanStats2$annual_inc > b)
LoanStats_ins <- LoanStats2[ind_1,]
plot(LoanStats_ins$annual_inc, ylab = "Annual Income Amount after")
hist(LoanStats_ins$annual_inc, breaks = 20,
xlab = "Loan Amount", col="#CCCCFF",
main="Histogram of Annual Income Distribution after")
summary(LoanStats2$annual_inc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1896 41000 58000 67329 80000 2039784
summary(LoanStats_ins$annual_inc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4080 40000 54000 56342 72000 109995
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_ins)/no_bins
unique(cut2(LoanStats_ins$annual_inc, m=no_cases_bin))
LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$annual_inc))
LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))
LoanStats_ins$annual_inc_bin <- cut2(LoanStats_ins$annual_inc, m=no_cases_bin)
annual_inc_bin <- sqldf('select annual_inc_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_ins group by annual_inc_bin, loan_status_new')
annual_inc_bin_tran <- reshape(annual_inc_bin, idvar = "annual_inc_bin",
timevar = "loan_status_new",
direction = "wide")
annual_inc_bin_tran <-data.frame(annual_inc_bin=annual_inc_bin_tran$annual_inc_bin,
charge_off_Cnts=annual_inc_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=annual_inc_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=annual_inc_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=annual_inc_bin_tran$`exposure.Fully Paid`,
tot_loan_amt=annual_inc_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=annual_inc_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(annual_inc_bin_tran$`cnts.Charged Off`/(annual_inc_bin_tran$`cnts.Charged Off`+
annual_inc_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(annual_inc_bin_tran$`exposure.Charged Off`/(annual_inc_bin_tran$`exposure.Charged Off`+
annual_inc_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(annual_inc_bin_tran$`cnts.Charged Off`/annual_inc_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(annual_inc_bin_tran$`exposure.Charged Off`/annual_inc_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- annual_inc_bin_tran[,c("annual_inc_bin","pct_cnts_grp")]
df2 <- annual_inc_bin_tran[,c("annual_inc_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=annual_inc_bin_tran,
aes(x=annual_inc_bin_tran$annual_inc_bin,
y=annual_inc_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Annual Income Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p3 <- ggplot() +
geom_line(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Annual Income Bins", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$annual_inc_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$annual_inc_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
p3
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(8)
## [[1]]
##
## [[2]]
Inference -> Initially we observed that rate of charge off was little zig-zag in 10 bins graph. -> But as we tried to reduce the bins it became evident that the rate of charge off decreases as wemove towards highr salary bins. -> So as we expect the individuals with lower salaries tends to charge off more.
#Debt to Incom Ratio Distribution and Analysis
summary(LoanStats2$dti)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.78 14.72 14.55 19.60 29.96
plot(LoanStats2$dti, ylab = "Debt to Incom Ratio before")
hist(LoanStats2$dti, breaks = 30,
xlab = "Debt to Incom Ratio", col="#CCCCFF",
main="Histogram of Debt to Incom Ratio Distribution")
a <- quantile(LoanStats2$dti,1)
b <- quantile(LoanStats2$dti,.05)
ind_1 <- which(LoanStats2$dti < a & LoanStats2$dti > b)
LoanStats_ins <- LoanStats2[ind_1,]
plot(LoanStats_ins$dti, ylab = "Debt to Incom Ratio after")
hist(LoanStats_ins$dti, breaks = 40,
xlab = "Debt to Incom Ratio", col="#CCCCFF",
main="Histogram of Debt to Incom Ratio Distribution")
summary(LoanStats2$dti)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.78 14.72 14.55 19.60 29.96
summary(LoanStats_ins$dti)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.90 10.62 15.17 15.20 19.84 29.93
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_ins)/no_bins
unique(cut2(LoanStats_ins$dti, m=no_cases_bin))
LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$dti))
LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))
LoanStats_ins$dti_bin <- cut2(LoanStats_ins$dti, m=no_cases_bin)
dti_bin <- sqldf('select dti_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_ins group by dti_bin, loan_status_new')
dti_bin_tran <- reshape(dti_bin, idvar = "dti_bin",
timevar = "loan_status_new",
direction = "wide")
dti_bin_tran <-data.frame(dti_bin=dti_bin_tran$dti_bin,
charge_off_Cnts=dti_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=dti_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=dti_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=dti_bin_tran$`exposure.Fully Paid`,
annual_inc_total=dti_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=dti_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=dti_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(dti_bin_tran$`cnts.Charged Off`/(dti_bin_tran$`cnts.Charged Off`+
dti_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(dti_bin_tran$`exposure.Charged Off`/(dti_bin_tran$`exposure.Charged Off`+
dti_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(dti_bin_tran$`cnts.Charged Off`/dti_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(dti_bin_tran$`exposure.Charged Off`/dti_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- dti_bin_tran[,c("dti_bin","pct_cnts_grp")]
df2 <- dti_bin_tran[,c("dti_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=dti_bin_tran,
aes(x=dti_bin_tran$dti_bin,
y=dti_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Debt to Incom Ratio Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p3 <- ggplot() +
geom_line(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Debt to Income Ratio Bins", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$dti_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$dti_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
p3
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(10)
## [[1]]
##
## [[2]]
Inference -> Initially we observed that rate of charge off was little zig-zag in 10 bins graph. -> But as we tried to reduce the bins it became evident that the rate of charge off increases as we move towards higher dti bins. -> So as we expect the individuals with higher debt burden tends to charge off more.
#The number of inquiries in past 6 months Distribution and Analysis
summary(LoanStats2$inq_last_6mths)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 1.000 1.007 2.000 33.000 1
ind_na <- which(is.na(LoanStats2$inq_last_6mths))
LoanStats3 <- LoanStats2[-ind_na,]
summary(LoanStats3$inq_last_6mths)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.007 2.000 33.000
plot(LoanStats3$inq_last_6mths, ylab = "No of inquiries in 6 months before")
hist(LoanStats3$inq_last_6mths, breaks = 100,
xlab = "No of inquiries in 6 months", col="#CCCCFF",
main="Histogram of No of inquiries in 6 months Distribution")
a <- quantile(LoanStats3$inq_last_6mths,.99)
b <- quantile(LoanStats3$inq_last_6mths,0)
ind_1 <- which(LoanStats3$inq_last_6mths < a & LoanStats3$inq_last_6mths > b)
LoanStats_ins <- LoanStats3[ind_1,]
plot(LoanStats_ins$inq_last_6mths, ylab = "No of inquiries in 6 months after")
hist(LoanStats_ins$inq_last_6mths, breaks = 30,
xlab = "No of inquiries in 6 months", col="#CCCCFF",
main="Histogram of No of inquiries in 6 months Distribution")
ind_wo <- which(LoanStats3$inq_last_6mths < 10 )
LoanStats_wo <- LoanStats3[ind_wo,]
library(gmodels)
data.frame(CrossTable(LoanStats3$inq_last_6mths,LoanStats3$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19775
##
##
## | LoanStats3$loan_status_new
## LoanStats3$inq_last_6mths | Charged Off | Fully Paid | Row Total |
## --------------------------|-------------|-------------|-------------|
## 0 | 1165 | 8218 | 9383 |
## | 0.124 | 0.876 | 0.474 |
## --------------------------|-------------|-------------|-------------|
## 1 | 878 | 4479 | 5357 |
## | 0.164 | 0.836 | 0.271 |
## --------------------------|-------------|-------------|-------------|
## 2 | 493 | 2214 | 2707 |
## | 0.182 | 0.818 | 0.137 |
## --------------------------|-------------|-------------|-------------|
## 3 | 310 | 1089 | 1399 |
## | 0.222 | 0.778 | 0.071 |
## --------------------------|-------------|-------------|-------------|
## 4 | 87 | 339 | 426 |
## | 0.204 | 0.796 | 0.022 |
## --------------------------|-------------|-------------|-------------|
## 5 | 55 | 174 | 229 |
## | 0.240 | 0.760 | 0.012 |
## --------------------------|-------------|-------------|-------------|
## 6 | 36 | 89 | 125 |
## | 0.288 | 0.712 | 0.006 |
## --------------------------|-------------|-------------|-------------|
## 7 | 26 | 36 | 62 |
## | 0.419 | 0.581 | 0.003 |
## --------------------------|-------------|-------------|-------------|
## 8 | 16 | 19 | 35 |
## | 0.457 | 0.543 | 0.002 |
## --------------------------|-------------|-------------|-------------|
## 9 | 7 | 12 | 19 |
## | 0.368 | 0.632 | 0.001 |
## --------------------------|-------------|-------------|-------------|
## 10 | 0 | 9 | 9 |
## | 0.000 | 1.000 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 11 | 2 | 3 | 5 |
## | 0.400 | 0.600 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 12 | 2 | 2 | 4 |
## | 0.500 | 0.500 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 13 | 1 | 1 | 2 |
## | 0.500 | 0.500 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 14 | 3 | 0 | 3 |
## | 1.000 | 0.000 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 15 | 1 | 3 | 4 |
## | 0.250 | 0.750 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 17 | 1 | 0 | 1 |
## | 1.000 | 0.000 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 18 | 0 | 1 | 1 |
## | 0.000 | 1.000 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 19 | 0 | 2 | 2 |
## | 0.000 | 1.000 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 28 | 0 | 1 | 1 |
## | 0.000 | 1.000 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## 33 | 1 | 0 | 1 |
## | 1.000 | 0.000 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## Column Total | 3084 | 16691 | 19775 |
## --------------------------|-------------|-------------|-------------|
##
##
## t.x t.y t.Freq prop.row.x prop.row.y prop.row.Freq prop.col.x
## 1 0 Charged Off 1165 0 Charged Off 0.1241607 0
## 2 1 Charged Off 878 1 Charged Off 0.1638977 1
## 3 2 Charged Off 493 2 Charged Off 0.1821204 2
## 4 3 Charged Off 310 3 Charged Off 0.2215868 3
## 5 4 Charged Off 87 4 Charged Off 0.2042254 4
## 6 5 Charged Off 55 5 Charged Off 0.2401747 5
## 7 6 Charged Off 36 6 Charged Off 0.2880000 6
## 8 7 Charged Off 26 7 Charged Off 0.4193548 7
## 9 8 Charged Off 16 8 Charged Off 0.4571429 8
## 10 9 Charged Off 7 9 Charged Off 0.3684211 9
## 11 10 Charged Off 0 10 Charged Off 0.0000000 10
## 12 11 Charged Off 2 11 Charged Off 0.4000000 11
## 13 12 Charged Off 2 12 Charged Off 0.5000000 12
## 14 13 Charged Off 1 13 Charged Off 0.5000000 13
## 15 14 Charged Off 3 14 Charged Off 1.0000000 14
## 16 15 Charged Off 1 15 Charged Off 0.2500000 15
## 17 17 Charged Off 1 17 Charged Off 1.0000000 17
## 18 18 Charged Off 0 18 Charged Off 0.0000000 18
## 19 19 Charged Off 0 19 Charged Off 0.0000000 19
## 20 28 Charged Off 0 28 Charged Off 0.0000000 28
## 21 33 Charged Off 1 33 Charged Off 1.0000000 33
## 22 0 Fully Paid 8218 0 Fully Paid 0.8758393 0
## 23 1 Fully Paid 4479 1 Fully Paid 0.8361023 1
## 24 2 Fully Paid 2214 2 Fully Paid 0.8178796 2
## 25 3 Fully Paid 1089 3 Fully Paid 0.7784132 3
## 26 4 Fully Paid 339 4 Fully Paid 0.7957746 4
## 27 5 Fully Paid 174 5 Fully Paid 0.7598253 5
## 28 6 Fully Paid 89 6 Fully Paid 0.7120000 6
## 29 7 Fully Paid 36 7 Fully Paid 0.5806452 7
## 30 8 Fully Paid 19 8 Fully Paid 0.5428571 8
## 31 9 Fully Paid 12 9 Fully Paid 0.6315789 9
## 32 10 Fully Paid 9 10 Fully Paid 1.0000000 10
## 33 11 Fully Paid 3 11 Fully Paid 0.6000000 11
## 34 12 Fully Paid 2 12 Fully Paid 0.5000000 12
## 35 13 Fully Paid 1 13 Fully Paid 0.5000000 13
## 36 14 Fully Paid 0 14 Fully Paid 0.0000000 14
## 37 15 Fully Paid 3 15 Fully Paid 0.7500000 15
## 38 17 Fully Paid 0 17 Fully Paid 0.0000000 17
## 39 18 Fully Paid 1 18 Fully Paid 1.0000000 18
## 40 19 Fully Paid 2 19 Fully Paid 1.0000000 19
## 41 28 Fully Paid 1 28 Fully Paid 1.0000000 28
## 42 33 Fully Paid 0 33 Fully Paid 0.0000000 33
## prop.col.y prop.col.Freq prop.tbl.x prop.tbl.y prop.tbl.Freq
## 1 Charged Off 3.777562e-01 0 Charged Off 0.0589127686
## 2 Charged Off 2.846952e-01 1 Charged Off 0.0443994943
## 3 Charged Off 1.598573e-01 2 Charged Off 0.0249304678
## 4 Charged Off 1.005188e-01 3 Charged Off 0.0156763590
## 5 Charged Off 2.821012e-02 4 Charged Off 0.0043994943
## 6 Charged Off 1.783398e-02 5 Charged Off 0.0027812895
## 7 Charged Off 1.167315e-02 6 Charged Off 0.0018204804
## 8 Charged Off 8.430610e-03 7 Charged Off 0.0013147914
## 9 Charged Off 5.188067e-03 8 Charged Off 0.0008091024
## 10 Charged Off 2.269780e-03 9 Charged Off 0.0003539823
## 11 Charged Off 0.000000e+00 10 Charged Off 0.0000000000
## 12 Charged Off 6.485084e-04 11 Charged Off 0.0001011378
## 13 Charged Off 6.485084e-04 12 Charged Off 0.0001011378
## 14 Charged Off 3.242542e-04 13 Charged Off 0.0000505689
## 15 Charged Off 9.727626e-04 14 Charged Off 0.0001517067
## 16 Charged Off 3.242542e-04 15 Charged Off 0.0000505689
## 17 Charged Off 3.242542e-04 17 Charged Off 0.0000505689
## 18 Charged Off 0.000000e+00 18 Charged Off 0.0000000000
## 19 Charged Off 0.000000e+00 19 Charged Off 0.0000000000
## 20 Charged Off 0.000000e+00 28 Charged Off 0.0000000000
## 21 Charged Off 3.242542e-04 33 Charged Off 0.0000505689
## 22 Fully Paid 4.923612e-01 0 Fully Paid 0.4155752212
## 23 Fully Paid 2.683482e-01 1 Fully Paid 0.2264981037
## 24 Fully Paid 1.326463e-01 2 Fully Paid 0.1119595449
## 25 Fully Paid 6.524474e-02 3 Fully Paid 0.0550695322
## 26 Fully Paid 2.031035e-02 4 Fully Paid 0.0171428571
## 27 Fully Paid 1.042478e-02 5 Fully Paid 0.0087989886
## 28 Fully Paid 5.332215e-03 6 Fully Paid 0.0045006321
## 29 Fully Paid 2.156851e-03 7 Fully Paid 0.0018204804
## 30 Fully Paid 1.138338e-03 8 Fully Paid 0.0009608091
## 31 Fully Paid 7.189503e-04 9 Fully Paid 0.0006068268
## 32 Fully Paid 5.392127e-04 10 Fully Paid 0.0004551201
## 33 Fully Paid 1.797376e-04 11 Fully Paid 0.0001517067
## 34 Fully Paid 1.198251e-04 12 Fully Paid 0.0001011378
## 35 Fully Paid 5.991253e-05 13 Fully Paid 0.0000505689
## 36 Fully Paid 0.000000e+00 14 Fully Paid 0.0000000000
## 37 Fully Paid 1.797376e-04 15 Fully Paid 0.0001517067
## 38 Fully Paid 0.000000e+00 17 Fully Paid 0.0000000000
## 39 Fully Paid 5.991253e-05 18 Fully Paid 0.0000505689
## 40 Fully Paid 1.198251e-04 19 Fully Paid 0.0001011378
## 41 Fully Paid 5.991253e-05 28 Fully Paid 0.0000505689
## 42 Fully Paid 0.000000e+00 33 Fully Paid 0.0000000000
summary(LoanStats3$inq_last_6mths)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.007 2.000 33.000
summary(LoanStats_wo$inq_last_6mths)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.9853 2.0000 9.0000
library(gmodels)
aaa <- data.frame(CrossTable(LoanStats_wo$inq_last_6mths,LoanStats_wo$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19742
##
##
## | LoanStats_wo$loan_status_new
## LoanStats_wo$inq_last_6mths | Charged Off | Fully Paid | Row Total |
## ----------------------------|-------------|-------------|-------------|
## 0 | 1165 | 8218 | 9383 |
## | 0.124 | 0.876 | 0.475 |
## ----------------------------|-------------|-------------|-------------|
## 1 | 878 | 4479 | 5357 |
## | 0.164 | 0.836 | 0.271 |
## ----------------------------|-------------|-------------|-------------|
## 2 | 493 | 2214 | 2707 |
## | 0.182 | 0.818 | 0.137 |
## ----------------------------|-------------|-------------|-------------|
## 3 | 310 | 1089 | 1399 |
## | 0.222 | 0.778 | 0.071 |
## ----------------------------|-------------|-------------|-------------|
## 4 | 87 | 339 | 426 |
## | 0.204 | 0.796 | 0.022 |
## ----------------------------|-------------|-------------|-------------|
## 5 | 55 | 174 | 229 |
## | 0.240 | 0.760 | 0.012 |
## ----------------------------|-------------|-------------|-------------|
## 6 | 36 | 89 | 125 |
## | 0.288 | 0.712 | 0.006 |
## ----------------------------|-------------|-------------|-------------|
## 7 | 26 | 36 | 62 |
## | 0.419 | 0.581 | 0.003 |
## ----------------------------|-------------|-------------|-------------|
## 8 | 16 | 19 | 35 |
## | 0.457 | 0.543 | 0.002 |
## ----------------------------|-------------|-------------|-------------|
## 9 | 7 | 12 | 19 |
## | 0.368 | 0.632 | 0.001 |
## ----------------------------|-------------|-------------|-------------|
## Column Total | 3073 | 16669 | 19742 |
## ----------------------------|-------------|-------------|-------------|
##
##
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x",
timevar = "aaa.t.y",
direction = "wide")
p <- ggplot()+geom_bar(data=c,
aes(x=c$aaa.t.x,
y=c$`aaa.t.Freq.Charged Off`),
stat="identity",fill = "#CCCCFF", colour="black") +
labs(x = "No of inquiries in 6 months", y = "")
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: % Charge off in Bin
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: Legend
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x,
c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_ins)/no_bins
unique(cut2(LoanStats_ins$inq_last_6mths, m=no_cases_bin))
LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$inq_last_6mths))
LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))
LoanStats_ins$inq_last_6mths_bin <- cut2(LoanStats_ins$inq_last_6mths, m=no_cases_bin)
inq_last_6mths_bin <- sqldf('select inq_last_6mths_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_ins group by inq_last_6mths_bin, loan_status_new')
inq_last_6mths_bin_tran <- reshape(inq_last_6mths_bin, idvar = "inq_last_6mths_bin",
timevar = "loan_status_new",
direction = "wide")
inq_last_6mths_bin_tran <-data.frame(inq_last_6mths_bin=inq_last_6mths_bin_tran$inq_last_6mths_bin,
charge_off_Cnts=inq_last_6mths_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=inq_last_6mths_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=inq_last_6mths_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=inq_last_6mths_bin_tran$`exposure.Fully Paid`,
annual_inc_total=inq_last_6mths_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=inq_last_6mths_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=inq_last_6mths_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(inq_last_6mths_bin_tran$`cnts.Charged Off`/(inq_last_6mths_bin_tran$`cnts.Charged Off`+
inq_last_6mths_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(inq_last_6mths_bin_tran$`exposure.Charged Off`/(inq_last_6mths_bin_tran$`exposure.Charged Off`+
inq_last_6mths_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(inq_last_6mths_bin_tran$`cnts.Charged Off`/inq_last_6mths_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(inq_last_6mths_bin_tran$`exposure.Charged Off`/inq_last_6mths_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- inq_last_6mths_bin_tran[,c("inq_last_6mths_bin","pct_cnts_grp")]
df2 <- inq_last_6mths_bin_tran[,c("inq_last_6mths_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=inq_last_6mths_bin_tran,
aes(x=inq_last_6mths_bin_tran$inq_last_6mths_bin,
y=inq_last_6mths_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "No of inquiries in 6 months Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "No of inquiries in 6 months Bins", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$inq_last_6mths_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$inq_last_6mths_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
p3
retList <- list(p,p3)
return(retList)
}
bin_plot(11)
## [[1]]
##
## [[2]]
bin_plot(10)
## [[1]]
##
## [[2]]
Inference -> We can see the rate of charge off gradually inceases as we move in buckets of hinger number of inquiries. -> it became quite evident when we reduce the number of bins.
#The number of months since the borrower's last delinquency.
summary(LoanStats2$mths_since_last_delinq)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 18.00 34.00 35.75 52.00 115.00 12657
ind_na <- which(is.na(LoanStats2$mths_since_last_delinq))
LoanStats3 <- LoanStats2[-ind_na,]
summary(LoanStats3$mths_since_last_delinq)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 18.00 34.00 35.75 52.00 115.00
plot(LoanStats3$mths_since_last_delinq, ylab = "No of months since last delinquency")
hist(LoanStats3$mths_since_last_delinq, breaks = 20,
xlab = "No of months since last delinquency", col="#CCCCFF",
main="Histogram of No of months since last delinquency")
a <- quantile(LoanStats3$mths_since_last_delinq,.95)
b <- quantile(LoanStats3$mths_since_last_delinq,0)
ind_1 <- which(LoanStats3$mths_since_last_delinq < a & LoanStats3$mths_since_last_delinq > b)
LoanStats_ins <- LoanStats3[ind_1,]
plot(LoanStats_ins$mths_since_last_delinq, ylab = "No of months since last delinquency")
hist(LoanStats_ins$mths_since_last_delinq, breaks = 20,
xlab = "No of months since last delinquency", col="#CCCCFF",
main="Histogram of No of months since last delinquency")
summary(LoanStats3$mths_since_last_delinq)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 18.00 34.00 35.75 52.00 115.00
summary(LoanStats_ins$mths_since_last_delinq)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 34.00 34.95 49.00 74.00
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_ins)/no_bins
unique(cut2(LoanStats_ins$mths_since_last_delinq, m=no_cases_bin))
LoanStats_ins$tot_loan_amt <- as.numeric(sum(LoanStats_ins$mths_since_last_delinq))
LoanStats_ins$tot_loan_cnt <- as.numeric(nrow(LoanStats_ins))
LoanStats_ins$mths_since_last_delinq_bin <- cut2(LoanStats_ins$mths_since_last_delinq, m=no_cases_bin)
mths_since_last_delinq_bin <- sqldf('select mths_since_last_delinq_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_ins group by mths_since_last_delinq_bin, loan_status_new')
mths_since_last_delinq_bin_tran <- reshape(mths_since_last_delinq_bin, idvar = "mths_since_last_delinq_bin",
timevar = "loan_status_new",
direction = "wide")
mths_since_last_delinq_bin_tran <-data.frame(mths_since_last_delinq_bin=mths_since_last_delinq_bin_tran$mths_since_last_delinq_bin,
charge_off_Cnts=mths_since_last_delinq_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=mths_since_last_delinq_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=mths_since_last_delinq_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=mths_since_last_delinq_bin_tran$`exposure.Fully Paid`,
annual_inc_total=mths_since_last_delinq_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=mths_since_last_delinq_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=mths_since_last_delinq_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(mths_since_last_delinq_bin_tran$`cnts.Charged Off`/(mths_since_last_delinq_bin_tran$`cnts.Charged Off`+
mths_since_last_delinq_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(mths_since_last_delinq_bin_tran$`exposure.Charged Off`/(mths_since_last_delinq_bin_tran$`exposure.Charged Off`+
mths_since_last_delinq_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(mths_since_last_delinq_bin_tran$`cnts.Charged Off`/mths_since_last_delinq_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(mths_since_last_delinq_bin_tran$`exposure.Charged Off`/mths_since_last_delinq_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- mths_since_last_delinq_bin_tran[,c("mths_since_last_delinq_bin","pct_cnts_grp")]
df2 <- mths_since_last_delinq_bin_tran[,c("mths_since_last_delinq_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=mths_since_last_delinq_bin_tran,
aes(x=mths_since_last_delinq_bin_tran$mths_since_last_delinq_bin,
y=mths_since_last_delinq_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "No of months since last delinquency", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "No of months since last delinquency", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$mths_since_last_delinq_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$mths_since_last_delinq_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(15)
## [[1]]
##
## [[2]]
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(6)
## [[1]]
##
## [[2]]
Inference -> We can observe a quite a zig-zag pattern when we have divided the varible for No of months since last delinquency in 10 bins. -> But as converge the number of bins we start to observe concave shaped line graph for the rate of charge offs. -> So wecan deduce that the buckets haveing highest and lowest No of months since last delinquency has slightly higher chances of charge offs.
#Interest received to date
summary(LoanStats2$total_rec_int)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 854.5 1702.4 2659.9 3369.8 23563.7
plot(LoanStats2$total_rec_int, ylab = "Interest received to date")
hist(LoanStats2$total_rec_int, breaks = 30,
xlab = "Interest received to date", col="#CCCCFF",
main="Histogram of Interest received to date")
a <- quantile(LoanStats2$total_rec_int,.90)
b <- quantile(LoanStats2$total_rec_int,0)
ind_la <- which(LoanStats2$total_rec_int < a & LoanStats2$total_rec_int > b)
LoanStats_la <- LoanStats2[ind_la,]
plot(LoanStats_la$total_rec_int, ylab = "Interest received to date")
hist(LoanStats_la$total_rec_int, breaks = 30,
xlab = "Interest received to date", col="#CCCCFF",
main="Histogram of Interest received to date")
summary(LoanStats2$total_rec_int)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 854.5 1702.4 2659.9 3369.8 23563.7
summary(LoanStats_la$total_rec_int)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.2 790.2 1498.3 1909.6 2693.1 6152.0
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]$loan_amnt))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la[which(LoanStats_la$loan_status_new == 'Charged Off'),]))
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_la)/no_bins
unique(cut2(LoanStats_la$total_rec_int, m=no_cases_bin))
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$total_rec_int))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
LoanStats_la$total_rec_int_bin <- cut2(LoanStats_la$total_rec_int, m=no_cases_bin)
total_rec_int_bin <- sqldf('select total_rec_int_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_la group by total_rec_int_bin, loan_status_new')
total_rec_int_bin_tran <- reshape(total_rec_int_bin, idvar = "total_rec_int_bin",
timevar = "loan_status_new",
direction = "wide")
total_rec_int_bin_tran <-data.frame(total_rec_int_bin=total_rec_int_bin_tran$total_rec_int_bin,
charge_off_Cnts=total_rec_int_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=total_rec_int_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=total_rec_int_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=total_rec_int_bin_tran$`exposure.Fully Paid`,
annual_inc_total=total_rec_int_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=total_rec_int_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=total_rec_int_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(total_rec_int_bin_tran$`cnts.Charged Off`/(total_rec_int_bin_tran$`cnts.Charged Off`+
total_rec_int_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(total_rec_int_bin_tran$`exposure.Charged Off`/(total_rec_int_bin_tran$`exposure.Charged Off`+
total_rec_int_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(total_rec_int_bin_tran$`cnts.Charged Off`/total_rec_int_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(total_rec_int_bin_tran$`exposure.Charged Off`/total_rec_int_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- total_rec_int_bin_tran[,c("total_rec_int_bin","pct_cnts_grp")]
df2 <- total_rec_int_bin_tran[,c("total_rec_int_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=total_rec_int_bin_tran,
aes(x=total_rec_int_bin_tran$total_rec_int_bin,
y=total_rec_int_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Total Interest Received to date", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Total Interest Received to date", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$total_rec_int_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$total_rec_int_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(8)
## [[1]]
##
## [[2]]
bin_plot(6)
## [[1]]
##
## [[2]]
Inference -> Initial zig-zag pattern was observed in more number of bins. -> We can observe here that for Interest received to date for the first bin has maximum charge off and as we move along first three bins it gradually decreases. -> After reducing the bins, we can observe a trend of intial decrease in charge off rate as we move from lower interest received bins towards higher interest received bins but later it becomes almost constant. -> Based on what we have observed we can deduce that loans in highest interest received till date has maximum chances to charge off.
# total_rec_prncp
summary(LoanStats2$total_rec_prncp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 5261 10000 10944 15000 35000
plot(LoanStats2$total_rec_prncp, ylab = "Principal received to date")
hist(LoanStats2$total_rec_prncp, breaks = 20,
xlab = "Interest received to date", col="#CCCCFF",
main="Histogram of Interest received to date")
a <- quantile(LoanStats2$total_rec_prncp,.90)
b <- quantile(LoanStats2$total_rec_prncp,0)
ind_la <- which(LoanStats2$total_rec_prncp < a & LoanStats2$total_rec_prncp > b)
LoanStats_la <- LoanStats2[ind_la,]
plot(LoanStats_la$total_rec_prncp, ylab = "Principal received to date")
hist(LoanStats_la$total_rec_prncp, breaks = 10,
xlab = "Interest received to date", col="#CCCCFF",
main="Histogram of Principal received to date")
summary(LoanStats2$total_rec_prncp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 5261 10000 10944 15000 35000
summary(LoanStats_la$total_rec_prncp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.51 5000.00 8927.02 9253.20 13000.00 20855.22
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_la)/no_bins
unique(cut2(LoanStats_la$total_rec_prncp, m=no_cases_bin))
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$total_rec_prncp))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
LoanStats_la$total_rec_prncp_bin <- cut2(LoanStats_la$total_rec_prncp, m=no_cases_bin)
total_rec_prncp_bin <- sqldf('select total_rec_prncp_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_la group by total_rec_prncp_bin, loan_status_new')
total_rec_prncp_bin_tran <- reshape(total_rec_prncp_bin, idvar = "total_rec_prncp_bin",
timevar = "loan_status_new",
direction = "wide")
total_rec_prncp_bin_tran <-data.frame(total_rec_prncp_bin=total_rec_prncp_bin_tran$total_rec_prncp_bin,
charge_off_Cnts=total_rec_prncp_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=total_rec_prncp_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=total_rec_prncp_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=total_rec_prncp_bin_tran$`exposure.Fully Paid`,
annual_inc_total=total_rec_prncp_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=total_rec_prncp_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=total_rec_prncp_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(total_rec_prncp_bin_tran$`cnts.Charged Off`/(total_rec_prncp_bin_tran$`cnts.Charged Off`+
total_rec_prncp_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(total_rec_prncp_bin_tran$`exposure.Charged Off`/(total_rec_prncp_bin_tran$`exposure.Charged Off`+
total_rec_prncp_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(total_rec_prncp_bin_tran$`cnts.Charged Off`/total_rec_prncp_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(total_rec_prncp_bin_tran$`exposure.Charged Off`/total_rec_prncp_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- total_rec_prncp_bin_tran[,c("total_rec_prncp_bin","pct_cnts_grp")]
df2 <- total_rec_prncp_bin_tran[,c("total_rec_prncp_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=total_rec_prncp_bin_tran,
aes(x=total_rec_prncp_bin_tran$total_rec_prncp_bin,
y=total_rec_prncp_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Principal received to date Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Principal received to date Bins", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$total_rec_prncp_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$total_rec_prncp_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(10)
## [[1]]
##
## [[2]]
Inference -> It is quite evident from the line graph that maximum charge off rate is observed in the lowest bucket of the principal received till date. -> We also know that the during the loan tenure a individual pays minimum part of principal at the start of repayment schedule and which gradully increases so based on these 2 hypothesis we can extrapolate that the loans have a tendency to charge off in their early period of repayment schedule.
# total_pymnt_inv
summary(LoanStats2$total_pymnt_inv)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 6012 10965 13008 17675 58564
plot(LoanStats2$total_pymnt_inv, ylab = "Payments received to date for portion of total amount funded by investors")
hist(LoanStats2$total_pymnt_inv, breaks = 20,
xlab = "Payments received from investors", col="#CCCCFF",
main="Histogram of Payments received to date for portion of total amount funded by investors")
a <- quantile(LoanStats2$total_pymnt_inv,.80)
b <- quantile(LoanStats2$total_pymnt_inv,0)
ind_la <- which(LoanStats2$total_pymnt_inv < a & LoanStats2$total_pymnt_inv > b)
LoanStats_la <- LoanStats2[ind_la,]
plot(LoanStats_la$total_pymnt_inv, ylab = "Payments received to date for portion of total amount funded by investors")
hist(LoanStats_la$total_pymnt_inv, breaks = 10,
xlab = "Payments received from investors", col="#CCCCFF",
main="Histogram of Payments received to date for portion of total amount funded by investors")
summary(LoanStats2$total_pymnt_inv)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 6012 10965 13008 17675 58564
summary(LoanStats_la$total_pymnt_inv)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8 5233.8 8864.0 9274.3 13282.5 19890.5
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_la)/no_bins
unique(cut2(LoanStats_la$total_pymnt_inv, m=no_cases_bin))
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$total_pymnt_inv))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
LoanStats_la$totalpymntinv_bin <- cut2(LoanStats_la$total_pymnt_inv, m=no_cases_bin)
totalpymntinv_bin_tran <- sqldf('select totalpymntinv_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_la group by totalpymntinv_bin, loan_status_new')
totalpymntinv_bin_tran <- reshape(totalpymntinv_bin_tran , idvar = "totalpymntinv_bin",
timevar = "loan_status_new",
direction = "wide")
totalpymntinv_bin_tran <-data.frame(totalpymntinv_bin=totalpymntinv_bin_tran$totalpymntinv_bin,
charge_off_Cnts=totalpymntinv_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=totalpymntinv_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=totalpymntinv_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=totalpymntinv_bin_tran$`exposure.Fully Paid`,
annual_inc_total=totalpymntinv_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=totalpymntinv_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=totalpymntinv_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(totalpymntinv_bin_tran$`cnts.Charged Off`/(totalpymntinv_bin_tran$`cnts.Charged Off`+
totalpymntinv_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(totalpymntinv_bin_tran$`exposure.Charged Off`/(totalpymntinv_bin_tran$`exposure.Charged Off`+
totalpymntinv_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(totalpymntinv_bin_tran$`cnts.Charged Off`/totalpymntinv_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(totalpymntinv_bin_tran$`exposure.Charged Off`/totalpymntinv_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- totalpymntinv_bin_tran[,c("totalpymntinv_bin","pct_cnts_grp")]
df2 <- totalpymntinv_bin_tran[,c("totalpymntinv_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=totalpymntinv_bin_tran,
aes(x=totalpymntinv_bin_tran$totalpymntinv_bin,
y=totalpymntinv_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Payments received from investors Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Payments received from investors Bins", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$totalpymntinv_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$totalpymntinv_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(10)
## [[1]]
##
## [[2]]
# The total number of credit lines currently in the borrower's credit file
ind <- which(is.na(LoanStats2$total_acc))
LoanStats3 <- LoanStats2[-ind,]
summary(LoanStats3$total_acc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 14.00 21.00 22.91 30.00 90.00
plot(LoanStats3$total_acc,
ylab = "Total number of credit lines")
hist(LoanStats3$total_acc, breaks = 30,
xlab = "Total number of credit lines", col="#CCCCFF",
main="Histogram of Total number of credit lines")
a <- quantile(LoanStats3$total_acc,.90)
b <- quantile(LoanStats3$total_acc,0)
ind_la <- which(LoanStats3$total_acc < a & LoanStats3$total_acc > b)
LoanStats_la <- LoanStats3[ind_la,]
plot(LoanStats_la$total_acc,
ylab = "Total number of credit lines")
hist(LoanStats_la$total_acc, breaks = 20,
xlab = "Total number of credit lines", col="#CCCCFF",
main="Histogram of Total number of credit lines")
summary(LoanStats3$total_acc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 14.00 21.00 22.91 30.00 90.00
summary(LoanStats_la$total_acc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 14.0 20.0 20.3 27.0 38.0
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_la)/no_bins
unique(cut2(LoanStats_la$total_acc, m=no_cases_bin))
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$total_acc))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
LoanStats_la$total_acc_bin <- cut2(LoanStats_la$total_acc, m=no_cases_bin)
total_acc_bin <- sqldf('select total_acc_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_la group by total_acc_bin, loan_status_new')
total_acc_bin_tran <- reshape(total_acc_bin, idvar = "total_acc_bin",
timevar = "loan_status_new",
direction = "wide")
total_acc_bin_tran <-data.frame(total_acc_bin=total_acc_bin_tran$total_acc_bin,
charge_off_Cnts=total_acc_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=total_acc_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=total_acc_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=total_acc_bin_tran$`exposure.Fully Paid`,
annual_inc_total=total_acc_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=total_acc_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=total_acc_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(total_acc_bin_tran$`cnts.Charged Off`/(total_acc_bin_tran$`cnts.Charged Off`+
total_acc_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(total_acc_bin_tran$`exposure.Charged Off`/(total_acc_bin_tran$`exposure.Charged Off`+
total_acc_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(total_acc_bin_tran$`cnts.Charged Off`/total_acc_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(total_acc_bin_tran$`exposure.Charged Off`/total_acc_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- total_acc_bin_tran[,c("total_acc_bin","pct_cnts_grp")]
df2 <- total_acc_bin_tran[,c("total_acc_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=total_acc_bin_tran,
aes(x=total_acc_bin_tran$total_acc_bin,
y=total_acc_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Total number of credit lines Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Total number of credit lines Bins", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$total_acc_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$total_acc_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(10)
## [[1]]
##
## [[2]]
bin_plot(8)
## [[1]]
##
## [[2]]
Inference -> Intial inspection of the graph with 10 bins suggests a almost constant rate of charge off in all the bins. -> But as we reduce the number of bins we can observe the rate of charge off is maximum in the first bucket and after decreasing it becomes constant. -> So we can say that the individuals with lower number of credit lines hade more tendancy to charge off.
# Total credit revolving balance
summary(LoanStats2$revol_bal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 5459 10681 15231 18658 465731
plot(LoanStats2$revol_bal,
ylab = "Total credit revolving balance")
hist(LoanStats2$revol_bal, breaks = 30,
xlab = "Total credit revolving balance", col="#CCCCFF",
main="Histogram of Total credit revolving balance")
a <- quantile(LoanStats2$revol_bal,.75)
b <- quantile(LoanStats2$revol_bal,0)
ind_la <- which(LoanStats2$revol_bal < a & LoanStats2$revol_bal > b)
LoanStats_la <- LoanStats2[ind_la,]
plot(LoanStats_la$revol_bal,
ylab = "Total credit revolving balance")
hist(LoanStats_la$revol_bal, breaks = 20,
xlab = "Total credit revolving balance", col="#CCCCFF",
main="Histogram of Total credit revolving balance")
summary(LoanStats2$revol_bal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 5459 10681 15231 18658 465731
summary(LoanStats_la$revol_bal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 4396 8121 8479 12340 18651
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_la)/no_bins
unique(cut2(LoanStats_la$revol_bal, m=no_cases_bin))
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$revol_bal))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
LoanStats_la$revol_bal_bin <- cut2(LoanStats_la$revol_bal, m=no_cases_bin)
revol_bal_bin <- sqldf('select revol_bal_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_la group by revol_bal_bin, loan_status_new')
revol_bal_bin_tran <- reshape(revol_bal_bin, idvar = "revol_bal_bin",
timevar = "loan_status_new",
direction = "wide")
revol_bal_bin_tran <-data.frame(revol_bal_bin=revol_bal_bin_tran$revol_bal_bin,
charge_off_Cnts=revol_bal_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=revol_bal_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=revol_bal_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=revol_bal_bin_tran$`exposure.Fully Paid`,
annual_inc_total=revol_bal_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=revol_bal_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=revol_bal_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(revol_bal_bin_tran$`cnts.Charged Off`/(revol_bal_bin_tran$`cnts.Charged Off`+
revol_bal_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(revol_bal_bin_tran$`exposure.Charged Off`/(revol_bal_bin_tran$`exposure.Charged Off`+
revol_bal_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(revol_bal_bin_tran$`cnts.Charged Off`/revol_bal_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(revol_bal_bin_tran$`exposure.Charged Off`/revol_bal_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- revol_bal_bin_tran[,c("revol_bal_bin","pct_cnts_grp")]
df2 <- revol_bal_bin_tran[,c("revol_bal_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=revol_bal_bin_tran,
aes(x=revol_bal_bin_tran$revol_bal_bin,
y=revol_bal_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Total credit revolving balance Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Total credit revolving balance Bins", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$revol_bal_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$revol_bal_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(10)
## [[1]]
##
## [[2]]
bin_plot(8)
## [[1]]
##
## [[2]]
Inference -> Almost no pattern observed between Total credit revolving balance and rate of charge off.
# Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.
ind <- which(is.na(LoanStats2$revol_util_num))
LoanStats3 <- LoanStats2[-ind,]
#cleaning the percentage which is above 100
ind <- which(LoanStats3$revol_util_num > 100)
LoanStats3 <- LoanStats3[-ind,]
summary(LoanStats3$revol_util_num)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 34.30 55.60 54.35 75.70 99.90
plot(LoanStats3$revol_util_num,
ylab = "Revolving line utilization rate")
hist(LoanStats3$revol_util_num, breaks = 20,
xlab = "Revolving line utilization rate", col="#CCCCFF",
main="Histogram of Revolving line utilization rate")
a <- quantile(LoanStats3$revol_util_num,1)
b <- quantile(LoanStats3$revol_util_num,.1)
ind_la <- which(LoanStats3$revol_util_num < a & LoanStats3$revol_util_num > b)
LoanStats_la <- LoanStats3[ind_la,]
plot(LoanStats_la$revol_util_num,
ylab = "Revolving line utilization rate")
hist(LoanStats_la$revol_util_num, breaks = 30,
xlab = "Revolving line utilization rate", col="#CCCCFF",
main="Histogram of Revolving line utilization rate")
summary(LoanStats3$revol_util_num)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 34.30 55.60 54.35 75.70 99.90
summary(LoanStats_la$revol_util_num)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.90 41.10 59.40 59.38 77.85 99.80
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_la)/no_bins
unique(cut2(LoanStats_la$revol_util_num, m=no_cases_bin))
LoanStats_la$tot_loan_amt <- as.numeric(sum(LoanStats_la$revol_util_num))
LoanStats_la$tot_loan_cnt <- as.numeric(nrow(LoanStats_la))
LoanStats_la$revol_util_num_bin <- cut2(LoanStats_la$revol_util_num, m=no_cases_bin)
revol_util_num_bin <- sqldf('select revol_util_num_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_la group by revol_util_num_bin, loan_status_new')
revol_util_num_bin_tran <- reshape(revol_util_num_bin, idvar = "revol_util_num_bin",
timevar = "loan_status_new",
direction = "wide")
revol_util_num_bin_tran <-data.frame(revol_util_num_bin=revol_util_num_bin_tran$revol_util_num_bin,
charge_off_Cnts=revol_util_num_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=revol_util_num_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=revol_util_num_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=revol_util_num_bin_tran$`exposure.Fully Paid`,
annual_inc_total=revol_util_num_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=revol_util_num_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=revol_util_num_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(revol_util_num_bin_tran$`cnts.Charged Off`/(revol_util_num_bin_tran$`cnts.Charged Off`+
revol_util_num_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(revol_util_num_bin_tran$`exposure.Charged Off`/(revol_util_num_bin_tran$`exposure.Charged Off`+
revol_util_num_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(revol_util_num_bin_tran$`cnts.Charged Off`/revol_util_num_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(revol_util_num_bin_tran$`exposure.Charged Off`/revol_util_num_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- revol_util_num_bin_tran[,c("revol_util_num_bin","pct_cnts_grp")]
df2 <- revol_util_num_bin_tran[,c("revol_util_num_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=revol_util_num_bin_tran,
aes(x=revol_util_num_bin_tran$revol_util_num_bin,
y=revol_util_num_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Revolving line utilization rate Bins", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Revolving line utilization rate Bins", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$revol_util_num_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$revol_util_num_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]
bin_plot(10)
## [[1]]
##
## [[2]]
Inference -> This is quite clear here we observe that as we move towards higher revolving utilization rate charge off rate increases. -> We can deduce that the individual who have used more there credit lines tends to charge off more.
unique(LoanStats2$term)
## [1] "60 months" "36 months"
library(gmodels)
aaa <- data.frame(CrossTable(LoanStats2$term,LoanStats2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19776
##
##
## | LoanStats2$loan_status_new
## LoanStats2$term | Charged Off | Fully Paid | Row Total |
## ----------------|-------------|-------------|-------------|
## 36 months | 1715 | 12394 | 14109 |
## | 0.122 | 0.878 | 0.713 |
## ----------------|-------------|-------------|-------------|
## 60 months | 1369 | 4298 | 5667 |
## | 0.242 | 0.758 | 0.287 |
## ----------------|-------------|-------------|-------------|
## Column Total | 3084 | 16692 | 19776 |
## ----------------|-------------|-------------|-------------|
##
##
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x",
timevar = "aaa.t.y",
direction = "wide")
p <- ggplot()+geom_bar(data=c,
aes(x=c$aaa.t.x,
y=c$`aaa.t.Freq.Charged Off`),
stat="identity",fill = "#CCCCFF", colour="black") +
labs(x = "Loan Term", y = "")
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: % Charge off in Bin
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: Legend
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x,
c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
Inference -> This is evident from the graph despite having large number of charge offs in loans with 36 month term the loans in 60 month term tends to charge off more. -> Longer term duration increases the chances of charge off.
library(gmodels)
#grade vs loan status
unique(LoanStats2$grade)
## [1] "C" "B" "D" "A" "E" "F" "G"
aaa <- data.frame(CrossTable(LoanStats2$grade,LoanStats2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19776
##
##
## | LoanStats2$loan_status_new
## LoanStats2$grade | Charged Off | Fully Paid | Row Total |
## -----------------|-------------|-------------|-------------|
## A | 245 | 3644 | 3889 |
## | 0.063 | 0.937 | 0.197 |
## -----------------|-------------|-------------|-------------|
## B | 670 | 5088 | 5758 |
## | 0.116 | 0.884 | 0.291 |
## -----------------|-------------|-------------|-------------|
## C | 690 | 3503 | 4193 |
## | 0.165 | 0.835 | 0.212 |
## -----------------|-------------|-------------|-------------|
## D | 673 | 2453 | 3126 |
## | 0.215 | 0.785 | 0.158 |
## -----------------|-------------|-------------|-------------|
## E | 477 | 1344 | 1821 |
## | 0.262 | 0.738 | 0.092 |
## -----------------|-------------|-------------|-------------|
## F | 243 | 480 | 723 |
## | 0.336 | 0.664 | 0.037 |
## -----------------|-------------|-------------|-------------|
## G | 86 | 180 | 266 |
## | 0.323 | 0.677 | 0.013 |
## -----------------|-------------|-------------|-------------|
## Column Total | 3084 | 16692 | 19776 |
## -----------------|-------------|-------------|-------------|
##
##
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x",
timevar = "aaa.t.y",
direction = "wide")
p <- ggplot()+geom_bar(data=c,
aes(x=c$aaa.t.x,
y=c$`aaa.t.Freq.Charged Off`),
stat="identity",fill = "#CCCCFF", colour="black") +
labs(x = "Loan Grade", y = "")
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: % Charge off in Bin
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: Legend
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x,
c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
Inference -> As expected the charge off rate increases aswe move towards lower Loan grade.
library(gmodels)
#grade vs loan status
unique(LoanStats2$sub_grade)
## [1] "C5" "C3" "B5" "C1" "D1" "A1" "C4" "B4" "B3" "C2" "A3" "A4" "A5" "D2"
## [15] "A2" "B1" "B2" "E4" "D3" "F3" "E3" "D4" "F1" "E1" "E5" "G4" "E2" "D5"
## [29] "F2" "G3" "F5" "G1" "F4" "G2" "G5"
aaa <- data.frame(CrossTable(LoanStats2$sub_grade,LoanStats2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19776
##
##
## | LoanStats2$loan_status_new
## LoanStats2$sub_grade | Charged Off | Fully Paid | Row Total |
## ---------------------|-------------|-------------|-------------|
## A1 | 10 | 353 | 363 |
## | 0.028 | 0.972 | 0.018 |
## ---------------------|-------------|-------------|-------------|
## A2 | 34 | 475 | 509 |
## | 0.067 | 0.933 | 0.026 |
## ---------------------|-------------|-------------|-------------|
## A3 | 43 | 611 | 654 |
## | 0.066 | 0.934 | 0.033 |
## ---------------------|-------------|-------------|-------------|
## A4 | 77 | 1128 | 1205 |
## | 0.064 | 0.936 | 0.061 |
## ---------------------|-------------|-------------|-------------|
## A5 | 81 | 1077 | 1158 |
## | 0.070 | 0.930 | 0.059 |
## ---------------------|-------------|-------------|-------------|
## B1 | 69 | 750 | 819 |
## | 0.084 | 0.916 | 0.041 |
## ---------------------|-------------|-------------|-------------|
## B2 | 99 | 876 | 975 |
## | 0.102 | 0.898 | 0.049 |
## ---------------------|-------------|-------------|-------------|
## B3 | 166 | 1193 | 1359 |
## | 0.122 | 0.878 | 0.069 |
## ---------------------|-------------|-------------|-------------|
## B4 | 150 | 1080 | 1230 |
## | 0.122 | 0.878 | 0.062 |
## ---------------------|-------------|-------------|-------------|
## B5 | 186 | 1189 | 1375 |
## | 0.135 | 0.865 | 0.070 |
## ---------------------|-------------|-------------|-------------|
## C1 | 150 | 911 | 1061 |
## | 0.141 | 0.859 | 0.054 |
## ---------------------|-------------|-------------|-------------|
## C2 | 171 | 888 | 1059 |
## | 0.161 | 0.839 | 0.054 |
## ---------------------|-------------|-------------|-------------|
## C3 | 148 | 653 | 801 |
## | 0.185 | 0.815 | 0.041 |
## ---------------------|-------------|-------------|-------------|
## C4 | 119 | 539 | 658 |
## | 0.181 | 0.819 | 0.033 |
## ---------------------|-------------|-------------|-------------|
## C5 | 102 | 512 | 614 |
## | 0.166 | 0.834 | 0.031 |
## ---------------------|-------------|-------------|-------------|
## D1 | 102 | 433 | 535 |
## | 0.191 | 0.809 | 0.027 |
## ---------------------|-------------|-------------|-------------|
## D2 | 160 | 623 | 783 |
## | 0.204 | 0.796 | 0.040 |
## ---------------------|-------------|-------------|-------------|
## D3 | 144 | 538 | 682 |
## | 0.211 | 0.789 | 0.034 |
## ---------------------|-------------|-------------|-------------|
## D4 | 143 | 439 | 582 |
## | 0.246 | 0.754 | 0.029 |
## ---------------------|-------------|-------------|-------------|
## D5 | 124 | 420 | 544 |
## | 0.228 | 0.772 | 0.028 |
## ---------------------|-------------|-------------|-------------|
## E1 | 126 | 352 | 478 |
## | 0.264 | 0.736 | 0.024 |
## ---------------------|-------------|-------------|-------------|
## E2 | 117 | 316 | 433 |
## | 0.270 | 0.730 | 0.022 |
## ---------------------|-------------|-------------|-------------|
## E3 | 84 | 273 | 357 |
## | 0.235 | 0.765 | 0.018 |
## ---------------------|-------------|-------------|-------------|
## E4 | 81 | 207 | 288 |
## | 0.281 | 0.719 | 0.015 |
## ---------------------|-------------|-------------|-------------|
## E5 | 69 | 196 | 265 |
## | 0.260 | 0.740 | 0.013 |
## ---------------------|-------------|-------------|-------------|
## F1 | 72 | 153 | 225 |
## | 0.320 | 0.680 | 0.011 |
## ---------------------|-------------|-------------|-------------|
## F2 | 51 | 116 | 167 |
## | 0.305 | 0.695 | 0.008 |
## ---------------------|-------------|-------------|-------------|
## F3 | 38 | 92 | 130 |
## | 0.292 | 0.708 | 0.007 |
## ---------------------|-------------|-------------|-------------|
## F4 | 47 | 71 | 118 |
## | 0.398 | 0.602 | 0.006 |
## ---------------------|-------------|-------------|-------------|
## F5 | 35 | 48 | 83 |
## | 0.422 | 0.578 | 0.004 |
## ---------------------|-------------|-------------|-------------|
## G1 | 24 | 51 | 75 |
## | 0.320 | 0.680 | 0.004 |
## ---------------------|-------------|-------------|-------------|
## G2 | 20 | 37 | 57 |
## | 0.351 | 0.649 | 0.003 |
## ---------------------|-------------|-------------|-------------|
## G3 | 20 | 28 | 48 |
## | 0.417 | 0.583 | 0.002 |
## ---------------------|-------------|-------------|-------------|
## G4 | 8 | 39 | 47 |
## | 0.170 | 0.830 | 0.002 |
## ---------------------|-------------|-------------|-------------|
## G5 | 14 | 25 | 39 |
## | 0.359 | 0.641 | 0.002 |
## ---------------------|-------------|-------------|-------------|
## Column Total | 3084 | 16692 | 19776 |
## ---------------------|-------------|-------------|-------------|
##
##
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x",
timevar = "aaa.t.y",
direction = "wide")
p <- ggplot()+geom_bar(data=c,
aes(x=c$aaa.t.x,
y=c$`aaa.t.Freq.Charged Off`),
stat="identity",fill = "#CCCCFF", colour="black") +
labs(x = "Loan Sub Grade", y = "")
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: % Charge off in Bin
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: Legend
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x,
c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
library(gmodels)
#Employment length vs loan status
unique(LoanStats2$emp_length)
## [1] "8 years" "5 years" "10+ years" "< 1 year" "6 years"
## [6] "3 years" "9 years" "7 years" "2 years" "1 year"
## [11] "4 years" "n/a"
CrossTable(LoanStats2$emp_length,LoanStats2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19776
##
##
## | LoanStats2$loan_status_new
## LoanStats2$emp_length | Charged Off | Fully Paid | Row Total |
## ----------------------|-------------|-------------|-------------|
## < 1 year | 334 | 1900 | 2234 |
## | 0.150 | 0.850 | 0.113 |
## ----------------------|-------------|-------------|-------------|
## 1 year | 238 | 1348 | 1586 |
## | 0.150 | 0.850 | 0.080 |
## ----------------------|-------------|-------------|-------------|
## 10+ years | 775 | 3691 | 4466 |
## | 0.174 | 0.826 | 0.226 |
## ----------------------|-------------|-------------|-------------|
## 2 years | 312 | 1890 | 2202 |
## | 0.142 | 0.858 | 0.111 |
## ----------------------|-------------|-------------|-------------|
## 3 years | 295 | 1727 | 2022 |
## | 0.146 | 0.854 | 0.102 |
## ----------------------|-------------|-------------|-------------|
## 4 years | 244 | 1463 | 1707 |
## | 0.143 | 0.857 | 0.086 |
## ----------------------|-------------|-------------|-------------|
## 5 years | 239 | 1402 | 1641 |
## | 0.146 | 0.854 | 0.083 |
## ----------------------|-------------|-------------|-------------|
## 6 years | 179 | 975 | 1154 |
## | 0.155 | 0.845 | 0.058 |
## ----------------------|-------------|-------------|-------------|
## 7 years | 148 | 760 | 908 |
## | 0.163 | 0.837 | 0.046 |
## ----------------------|-------------|-------------|-------------|
## 8 years | 127 | 668 | 795 |
## | 0.160 | 0.840 | 0.040 |
## ----------------------|-------------|-------------|-------------|
## 9 years | 99 | 549 | 648 |
## | 0.153 | 0.847 | 0.033 |
## ----------------------|-------------|-------------|-------------|
## n/a | 94 | 319 | 413 |
## | 0.228 | 0.772 | 0.021 |
## ----------------------|-------------|-------------|-------------|
## Column Total | 3084 | 16692 | 19776 |
## ----------------------|-------------|-------------|-------------|
##
##
ind_na <- which(LoanStats2$emp_length != "n/a")
LoanStats_el <- LoanStats2[ind_na,]
unique(LoanStats_el$emp_length)
## [1] "8 years" "5 years" "10+ years" "< 1 year" "6 years"
## [6] "3 years" "9 years" "7 years" "2 years" "1 year"
## [11] "4 years"
aaa <- data.frame(CrossTable(LoanStats_el$emp_length,LoanStats_el$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19363
##
##
## | LoanStats_el$loan_status_new
## LoanStats_el$emp_length | Charged Off | Fully Paid | Row Total |
## ------------------------|-------------|-------------|-------------|
## < 1 year | 334 | 1900 | 2234 |
## | 0.150 | 0.850 | 0.115 |
## ------------------------|-------------|-------------|-------------|
## 1 year | 238 | 1348 | 1586 |
## | 0.150 | 0.850 | 0.082 |
## ------------------------|-------------|-------------|-------------|
## 10+ years | 775 | 3691 | 4466 |
## | 0.174 | 0.826 | 0.231 |
## ------------------------|-------------|-------------|-------------|
## 2 years | 312 | 1890 | 2202 |
## | 0.142 | 0.858 | 0.114 |
## ------------------------|-------------|-------------|-------------|
## 3 years | 295 | 1727 | 2022 |
## | 0.146 | 0.854 | 0.104 |
## ------------------------|-------------|-------------|-------------|
## 4 years | 244 | 1463 | 1707 |
## | 0.143 | 0.857 | 0.088 |
## ------------------------|-------------|-------------|-------------|
## 5 years | 239 | 1402 | 1641 |
## | 0.146 | 0.854 | 0.085 |
## ------------------------|-------------|-------------|-------------|
## 6 years | 179 | 975 | 1154 |
## | 0.155 | 0.845 | 0.060 |
## ------------------------|-------------|-------------|-------------|
## 7 years | 148 | 760 | 908 |
## | 0.163 | 0.837 | 0.047 |
## ------------------------|-------------|-------------|-------------|
## 8 years | 127 | 668 | 795 |
## | 0.160 | 0.840 | 0.041 |
## ------------------------|-------------|-------------|-------------|
## 9 years | 99 | 549 | 648 |
## | 0.153 | 0.847 | 0.033 |
## ------------------------|-------------|-------------|-------------|
## Column Total | 2990 | 16373 | 19363 |
## ------------------------|-------------|-------------|-------------|
##
##
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x",
timevar = "aaa.t.y",
direction = "wide")
p <- ggplot()+geom_bar(data=c,
aes(x=c$aaa.t.x,
y=c$`aaa.t.Freq.Charged Off`),
stat="identity",fill = "#CCCCFF", colour="black") +
labs(x = "Length of Employment", y = "")
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: % Charge off in Bin
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: Legend
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x,
c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
library(gmodels)
#The home ownership status vs loan status
unique(LoanStats2$home_ownership)
## [1] "RENT" "OWN" "MORTGAGE" "OTHER" "NONE"
CrossTable(LoanStats2$home_ownership,LoanStats2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19776
##
##
## | LoanStats2$loan_status_new
## LoanStats2$home_ownership | Charged Off | Fully Paid | Row Total |
## --------------------------|-------------|-------------|-------------|
## MORTGAGE | 1286 | 7022 | 8308 |
## | 0.155 | 0.845 | 0.420 |
## --------------------------|-------------|-------------|-------------|
## NONE | 0 | 1 | 1 |
## | 0.000 | 1.000 | 0.000 |
## --------------------------|-------------|-------------|-------------|
## OTHER | 12 | 40 | 52 |
## | 0.231 | 0.769 | 0.003 |
## --------------------------|-------------|-------------|-------------|
## OWN | 209 | 1156 | 1365 |
## | 0.153 | 0.847 | 0.069 |
## --------------------------|-------------|-------------|-------------|
## RENT | 1577 | 8473 | 10050 |
## | 0.157 | 0.843 | 0.508 |
## --------------------------|-------------|-------------|-------------|
## Column Total | 3084 | 16692 | 19776 |
## --------------------------|-------------|-------------|-------------|
##
##
ind_na <- which(LoanStats2$home_ownership != "NONE")
LoanStats_el <- LoanStats2[ind_na,]
unique(LoanStats_el$home_ownership)
## [1] "RENT" "OWN" "MORTGAGE" "OTHER"
aaa <- data.frame(CrossTable(LoanStats_el$home_ownership,LoanStats_el$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19775
##
##
## | LoanStats_el$loan_status_new
## LoanStats_el$home_ownership | Charged Off | Fully Paid | Row Total |
## ----------------------------|-------------|-------------|-------------|
## MORTGAGE | 1286 | 7022 | 8308 |
## | 0.155 | 0.845 | 0.420 |
## ----------------------------|-------------|-------------|-------------|
## OTHER | 12 | 40 | 52 |
## | 0.231 | 0.769 | 0.003 |
## ----------------------------|-------------|-------------|-------------|
## OWN | 209 | 1156 | 1365 |
## | 0.153 | 0.847 | 0.069 |
## ----------------------------|-------------|-------------|-------------|
## RENT | 1577 | 8473 | 10050 |
## | 0.157 | 0.843 | 0.508 |
## ----------------------------|-------------|-------------|-------------|
## Column Total | 3084 | 16691 | 19775 |
## ----------------------------|-------------|-------------|-------------|
##
##
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x",
timevar = "aaa.t.y",
direction = "wide")
p <- ggplot()+geom_bar(data=c,
aes(x=c$aaa.t.x,
y=c$`aaa.t.Freq.Charged Off`),
stat="identity",fill = "#CCCCFF", colour="black") +
labs(x = "The home ownership status", y = "")
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: % Charge off in Bin
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: Legend
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x,
c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
library(gmodels)
#verification_status vs loan status
unique(LoanStats2$verification_status)
## [1] "Not Verified" "Source Verified" "Verified"
aaa <- data.frame(CrossTable(LoanStats2$verification_status,LoanStats2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19776
##
##
## | LoanStats2$loan_status_new
## LoanStats2$verification_status | Charged Off | Fully Paid | Row Total |
## -------------------------------|-------------|-------------|-------------|
## Not Verified | 1131 | 6907 | 8038 |
## | 0.141 | 0.859 | 0.406 |
## -------------------------------|-------------|-------------|-------------|
## Source Verified | 728 | 3985 | 4713 |
## | 0.154 | 0.846 | 0.238 |
## -------------------------------|-------------|-------------|-------------|
## Verified | 1225 | 5800 | 7025 |
## | 0.174 | 0.826 | 0.355 |
## -------------------------------|-------------|-------------|-------------|
## Column Total | 3084 | 16692 | 19776 |
## -------------------------------|-------------|-------------|-------------|
##
##
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x",
timevar = "aaa.t.y",
direction = "wide")
p <- ggplot()+geom_bar(data=c,
aes(x=c$aaa.t.x,
y=c$`aaa.t.Freq.Charged Off`),
stat="identity",fill = "#CCCCFF", colour="black") +
labs(x = "Verification status", y = "")
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: % Charge off in Bin
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: Legend
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x,
c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
library(gmodels)
#Address State vs loan status
unique(LoanStats2$addr_state)
## [1] "NC" "AZ" "CA" "VA" "MO" "TX" "FL" "NY" "NJ" "OR" "OH" "IL" "RI" "LA"
## [15] "CT" "WA" "WI" "MN" "AL" "KS" "NV" "PA" "GA" "VT" "MI" "NH" "AR" "MD"
## [29] "NM" "CO" "MT" "WV" "MA" "AK" "SC" "WY" "KY" "HI" "OK" "UT" "DC" "SD"
## [43] "DE" "IA" "MS" "TN" "NE" "ID" "IN"
aaa <- data.frame(CrossTable(LoanStats2$addr_state,LoanStats2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19776
##
##
## | LoanStats2$loan_status_new
## LoanStats2$addr_state | Charged Off | Fully Paid | Row Total |
## ----------------------|-------------|-------------|-------------|
## AK | 7 | 27 | 34 |
## | 0.206 | 0.794 | 0.002 |
## ----------------------|-------------|-------------|-------------|
## AL | 28 | 213 | 241 |
## | 0.116 | 0.884 | 0.012 |
## ----------------------|-------------|-------------|-------------|
## AR | 10 | 100 | 110 |
## | 0.091 | 0.909 | 0.006 |
## ----------------------|-------------|-------------|-------------|
## AZ | 65 | 317 | 382 |
## | 0.170 | 0.830 | 0.019 |
## ----------------------|-------------|-------------|-------------|
## CA | 590 | 2984 | 3574 |
## | 0.165 | 0.835 | 0.181 |
## ----------------------|-------------|-------------|-------------|
## CO | 59 | 347 | 406 |
## | 0.145 | 0.855 | 0.021 |
## ----------------------|-------------|-------------|-------------|
## CT | 54 | 328 | 382 |
## | 0.141 | 0.859 | 0.019 |
## ----------------------|-------------|-------------|-------------|
## DC | 7 | 109 | 116 |
## | 0.060 | 0.940 | 0.006 |
## ----------------------|-------------|-------------|-------------|
## DE | 3 | 48 | 51 |
## | 0.059 | 0.941 | 0.003 |
## ----------------------|-------------|-------------|-------------|
## FL | 255 | 1102 | 1357 |
## | 0.188 | 0.812 | 0.069 |
## ----------------------|-------------|-------------|-------------|
## GA | 103 | 566 | 669 |
## | 0.154 | 0.846 | 0.034 |
## ----------------------|-------------|-------------|-------------|
## HI | 10 | 75 | 85 |
## | 0.118 | 0.882 | 0.004 |
## ----------------------|-------------|-------------|-------------|
## IA | 1 | 6 | 7 |
## | 0.143 | 0.857 | 0.000 |
## ----------------------|-------------|-------------|-------------|
## ID | 0 | 4 | 4 |
## | 0.000 | 1.000 | 0.000 |
## ----------------------|-------------|-------------|-------------|
## IL | 111 | 649 | 760 |
## | 0.146 | 0.854 | 0.038 |
## ----------------------|-------------|-------------|-------------|
## IN | 5 | 7 | 12 |
## | 0.417 | 0.583 | 0.001 |
## ----------------------|-------------|-------------|-------------|
## KS | 12 | 129 | 141 |
## | 0.085 | 0.915 | 0.007 |
## ----------------------|-------------|-------------|-------------|
## KY | 33 | 142 | 175 |
## | 0.189 | 0.811 | 0.009 |
## ----------------------|-------------|-------------|-------------|
## LA | 21 | 174 | 195 |
## | 0.108 | 0.892 | 0.010 |
## ----------------------|-------------|-------------|-------------|
## MA | 83 | 621 | 704 |
## | 0.118 | 0.882 | 0.036 |
## ----------------------|-------------|-------------|-------------|
## MD | 97 | 444 | 541 |
## | 0.179 | 0.821 | 0.027 |
## ----------------------|-------------|-------------|-------------|
## MI | 55 | 271 | 326 |
## | 0.169 | 0.831 | 0.016 |
## ----------------------|-------------|-------------|-------------|
## MN | 45 | 253 | 298 |
## | 0.151 | 0.849 | 0.015 |
## ----------------------|-------------|-------------|-------------|
## MO | 69 | 301 | 370 |
## | 0.186 | 0.814 | 0.019 |
## ----------------------|-------------|-------------|-------------|
## MS | 1 | 6 | 7 |
## | 0.143 | 0.857 | 0.000 |
## ----------------------|-------------|-------------|-------------|
## MT | 8 | 29 | 37 |
## | 0.216 | 0.784 | 0.002 |
## ----------------------|-------------|-------------|-------------|
## NC | 71 | 322 | 393 |
## | 0.181 | 0.819 | 0.020 |
## ----------------------|-------------|-------------|-------------|
## NE | 3 | 4 | 7 |
## | 0.429 | 0.571 | 0.000 |
## ----------------------|-------------|-------------|-------------|
## NH | 15 | 84 | 99 |
## | 0.152 | 0.848 | 0.005 |
## ----------------------|-------------|-------------|-------------|
## NJ | 163 | 827 | 990 |
## | 0.165 | 0.835 | 0.050 |
## ----------------------|-------------|-------------|-------------|
## NM | 20 | 87 | 107 |
## | 0.187 | 0.813 | 0.005 |
## ----------------------|-------------|-------------|-------------|
## NV | 54 | 173 | 227 |
## | 0.238 | 0.762 | 0.011 |
## ----------------------|-------------|-------------|-------------|
## NY | 291 | 1650 | 1941 |
## | 0.150 | 0.850 | 0.098 |
## ----------------------|-------------|-------------|-------------|
## OH | 91 | 538 | 629 |
## | 0.145 | 0.855 | 0.032 |
## ----------------------|-------------|-------------|-------------|
## OK | 27 | 109 | 136 |
## | 0.199 | 0.801 | 0.007 |
## ----------------------|-------------|-------------|-------------|
## OR | 43 | 203 | 246 |
## | 0.175 | 0.825 | 0.012 |
## ----------------------|-------------|-------------|-------------|
## PA | 95 | 637 | 732 |
## | 0.130 | 0.870 | 0.037 |
## ----------------------|-------------|-------------|-------------|
## RI | 19 | 91 | 110 |
## | 0.173 | 0.827 | 0.006 |
## ----------------------|-------------|-------------|-------------|
## SC | 34 | 169 | 203 |
## | 0.167 | 0.833 | 0.010 |
## ----------------------|-------------|-------------|-------------|
## SD | 7 | 23 | 30 |
## | 0.233 | 0.767 | 0.002 |
## ----------------------|-------------|-------------|-------------|
## TN | 5 | 4 | 9 |
## | 0.556 | 0.444 | 0.000 |
## ----------------------|-------------|-------------|-------------|
## TX | 166 | 1095 | 1261 |
## | 0.132 | 0.868 | 0.064 |
## ----------------------|-------------|-------------|-------------|
## UT | 18 | 114 | 132 |
## | 0.136 | 0.864 | 0.007 |
## ----------------------|-------------|-------------|-------------|
## VA | 98 | 617 | 715 |
## | 0.137 | 0.863 | 0.036 |
## ----------------------|-------------|-------------|-------------|
## VT | 5 | 23 | 28 |
## | 0.179 | 0.821 | 0.001 |
## ----------------------|-------------|-------------|-------------|
## WA | 73 | 348 | 421 |
## | 0.173 | 0.827 | 0.021 |
## ----------------------|-------------|-------------|-------------|
## WI | 43 | 197 | 240 |
## | 0.179 | 0.821 | 0.012 |
## ----------------------|-------------|-------------|-------------|
## WV | 10 | 88 | 98 |
## | 0.102 | 0.898 | 0.005 |
## ----------------------|-------------|-------------|-------------|
## WY | 1 | 37 | 38 |
## | 0.026 | 0.974 | 0.002 |
## ----------------------|-------------|-------------|-------------|
## Column Total | 3084 | 16692 | 19776 |
## ----------------------|-------------|-------------|-------------|
##
##
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x",
timevar = "aaa.t.y",
direction = "wide")
p <- ggplot()+geom_bar(data=c,
aes(x=c$aaa.t.x,
y=c$`aaa.t.Freq.Charged Off`),
stat="identity",fill = "#CCCCFF", colour="black") +
labs(x = "Address State", y = "")
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: % Charge off in Bin
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: Legend
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x,
c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
#Issue Date Analysis
ss = strsplit(LoanStats2$issue_d,'-')
LoanStats2$issue_m <- sapply(ss,function(x)x[1])
LoanStats2$issue_y <- sapply(ss,function(x)x[2])
LoanStats_issued <- sqldf('select issue_d,
loan_status_new, count(*) as cnts, sum(loan_amnt) as total_exposure
from LoanStats2 group by issue_d')
issue_d_tran_d <- reshape(LoanStats_issued, idvar = "issue_d",
timevar = "loan_status_new",
direction = "wide")
issue_d_tran_d
## issue_d cnts.Fully Paid total_exposure.Fully Paid cnts.Charged Off
## 1 Apr-2008 114 1063575 NA
## 2 Apr-2009 102 999375 NA
## 3 Apr-2010 395 4825825 NA
## 4 Apr-2011 686 8952575 NA
## 5 Aug-2007 NA NA 17
## 6 Aug-2008 24 156150 NA
## 7 Aug-2009 NA NA 196
## 8 Aug-2010 579 6946800 NA
## 9 Aug-2011 935 12749850 NA
## 10 Dec-2007 NA NA 69
## 11 Dec-2008 NA NA 96
## 12 Dec-2009 280 3406650 NA
## 13 Dec-2010 637 7487575 NA
## 14 Dec-2011 1215 18242125 NA
## 15 Feb-2008 NA NA 160
## 16 Feb-2009 117 1122850 NA
## 17 Feb-2010 307 3719925 NA
## 18 Feb-2011 NA NA 670
## 19 Jan-2008 136 1445325 NA
## 20 Jan-2009 NA NA 120
## 21 Jan-2010 294 3603625 NA
## 22 Jan-2011 718 8489150 NA
## 23 Jul-2007 16 80075 NA
## 24 Jul-2008 41 265850 NA
## 25 Jul-2009 176 1857625 NA
## 26 Jul-2010 527 6059000 NA
## 27 Jul-2011 875 11251800 NA
## 28 Jun-2007 3 17900 NA
## 29 Jun-2008 36 218025 NA
## 30 Jun-2009 NA NA 154
## 31 Jun-2010 462 5550350 NA
## 32 Jun-2011 832 11054675 NA
## 33 Mar-2008 184 1988000 NA
## 34 Mar-2009 127 1415050 NA
## 35 Mar-2010 361 4258175 NA
## 36 Mar-2011 647 8068125 NA
## 37 May-2008 44 248975 NA
## 38 May-2009 121 1380225 NA
## 39 May-2010 448 5633350 NA
## 40 May-2011 NA NA 788
## 41 Nov-2007 41 345800 NA
## 42 Nov-2008 84 927775 NA
## 43 Nov-2009 307 3409950 NA
## 44 Nov-2010 548 6734600 NA
## 45 Nov-2011 1201 16835100 NA
## 46 Oct-2007 NA NA 37
## 47 Oct-2008 37 317450 NA
## 48 Oct-2009 270 3221800 NA
## 49 Oct-2010 NA NA 627
## 50 Oct-2011 1058 15391725 NA
## 51 Sep-2007 NA NA 24
## 52 Sep-2008 20 108875 NA
## 53 Sep-2009 224 2647725 NA
## 54 Sep-2010 544 6369025 NA
## 55 Sep-2011 1045 14743800 NA
## total_exposure.Charged Off
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 133800
## 6 NA
## 7 2151400
## 8 NA
## 9 NA
## 10 819700
## 11 1064075
## 12 NA
## 13 NA
## 14 NA
## 15 1579750
## 16 NA
## 17 NA
## 18 8292250
## 19 NA
## 20 1153800
## 21 NA
## 22 NA
## 23 NA
## 24 NA
## 25 NA
## 26 NA
## 27 NA
## 28 NA
## 29 NA
## 30 1600700
## 31 NA
## 32 NA
## 33 NA
## 34 NA
## 35 NA
## 36 NA
## 37 NA
## 38 NA
## 39 NA
## 40 9986075
## 41 NA
## 42 NA
## 43 NA
## 44 NA
## 45 NA
## 46 279900
## 47 NA
## 48 NA
## 49 7501050
## 50 NA
## 51 202150
## 52 NA
## 53 NA
## 54 NA
## 55 NA
LoanStats_issuey <- sqldf('select issue_y,
loan_status_new, count(*) as cnts, sum(loan_amnt) as total_exposure
from LoanStats2 group by issue_y')
issue_d_tran_y <- reshape(LoanStats_issuey, idvar = "issue_y",
timevar = "loan_status_new",
direction = "wide")
issue_d_tran_y
## issue_y cnts.Fully Paid total_exposure.Fully Paid cnts.Charged Off
## 1 2007 207 1879325 NA
## 2 2008 976 9383825 NA
## 3 2009 NA NA 2194
## 4 2010 5729 68689300 NA
## 5 2011 10670 144057250 NA
## total_exposure.Charged Off
## 1 NA
## 2 NA
## 3 24367150
## 4 NA
## 5 NA
LoanStats_issuem <- sqldf('select issue_m,
loan_status_new, count(*) as cnts, sum(loan_amnt) as total_exposure
from LoanStats2 group by issue_m')
issue_d_tran_m <- reshape(LoanStats_issuem, idvar = "issue_m",
timevar = "loan_status_new",
direction = "wide")
issue_d_tran_m
## issue_m cnts.Fully Paid total_exposure.Fully Paid cnts.Charged Off
## 1 Apr 1297 15841350 NA
## 2 Aug NA NA 1751
## 3 Dec NA NA 2297
## 4 Feb NA NA 1254
## 5 Jan 1268 14691900 NA
## 6 Jul 1635 19514350 NA
## 7 Jun 1487 18441650 NA
## 8 Mar 1319 15729350 NA
## 9 May 1401 17248625 NA
## 10 Nov 2181 28253225 NA
## 11 Oct NA NA 2029
## 12 Sep NA NA 1857
## total_exposure.Charged Off
## 1 NA
## 2 22138000
## 3 31020125
## 4 14714775
## 5 NA
## 6 NA
## 7 NA
## 8 NA
## 9 NA
## 10 NA
## 11 26711925
## 12 24071575
summary(LoanStats2$delinq_2yrs)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.1407 0.0000 13.0000 1
unique(LoanStats2$delinq_2yrs)
## [1] 0 1 2 3 4 6 5 7 8 13 NA
library(gmodels)
CrossTable(LoanStats2$delinq_2yrs,LoanStats2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 19775
##
##
## | LoanStats2$loan_status_new
## LoanStats2$delinq_2yrs | Charged Off | Fully Paid | Row Total |
## -----------------------|-------------|-------------|-------------|
## 0 | 2729 | 14947 | 17676 |
## | 0.154 | 0.846 | 0.894 |
## -----------------------|-------------|-------------|-------------|
## 1 | 274 | 1367 | 1641 |
## | 0.167 | 0.833 | 0.083 |
## -----------------------|-------------|-------------|-------------|
## 2 | 65 | 256 | 321 |
## | 0.202 | 0.798 | 0.016 |
## -----------------------|-------------|-------------|-------------|
## 3 | 10 | 80 | 90 |
## | 0.111 | 0.889 | 0.005 |
## -----------------------|-------------|-------------|-------------|
## 4 | 5 | 21 | 26 |
## | 0.192 | 0.808 | 0.001 |
## -----------------------|-------------|-------------|-------------|
## 5 | 0 | 11 | 11 |
## | 0.000 | 1.000 | 0.001 |
## -----------------------|-------------|-------------|-------------|
## 6 | 1 | 5 | 6 |
## | 0.167 | 0.833 | 0.000 |
## -----------------------|-------------|-------------|-------------|
## 7 | 0 | 2 | 2 |
## | 0.000 | 1.000 | 0.000 |
## -----------------------|-------------|-------------|-------------|
## 8 | 0 | 1 | 1 |
## | 0.000 | 1.000 | 0.000 |
## -----------------------|-------------|-------------|-------------|
## 13 | 0 | 1 | 1 |
## | 0.000 | 1.000 | 0.000 |
## -----------------------|-------------|-------------|-------------|
## Column Total | 3084 | 16691 | 19775 |
## -----------------------|-------------|-------------|-------------|
##
##
ind <- which(LoanStats2$delinq_2yrs > 0)
LoanStats_d2 <- LoanStats2[ind,]
aaa <-data.frame(CrossTable(LoanStats_d2$delinq_2yrs,LoanStats_d2$loan_status_new,
prop.r=TRUE,prop.c=FALSE, prop.t=FALSE, prop.chisq=FALSE))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 2099
##
##
## | LoanStats_d2$loan_status_new
## LoanStats_d2$delinq_2yrs | Charged Off | Fully Paid | Row Total |
## -------------------------|-------------|-------------|-------------|
## 1 | 274 | 1367 | 1641 |
## | 0.167 | 0.833 | 0.782 |
## -------------------------|-------------|-------------|-------------|
## 2 | 65 | 256 | 321 |
## | 0.202 | 0.798 | 0.153 |
## -------------------------|-------------|-------------|-------------|
## 3 | 10 | 80 | 90 |
## | 0.111 | 0.889 | 0.043 |
## -------------------------|-------------|-------------|-------------|
## 4 | 5 | 21 | 26 |
## | 0.192 | 0.808 | 0.012 |
## -------------------------|-------------|-------------|-------------|
## 5 | 0 | 11 | 11 |
## | 0.000 | 1.000 | 0.005 |
## -------------------------|-------------|-------------|-------------|
## 6 | 1 | 5 | 6 |
## | 0.167 | 0.833 | 0.003 |
## -------------------------|-------------|-------------|-------------|
## 7 | 0 | 2 | 2 |
## | 0.000 | 1.000 | 0.001 |
## -------------------------|-------------|-------------|-------------|
## 8 | 0 | 1 | 1 |
## | 0.000 | 1.000 | 0.000 |
## -------------------------|-------------|-------------|-------------|
## 13 | 0 | 1 | 1 |
## | 0.000 | 1.000 | 0.000 |
## -------------------------|-------------|-------------|-------------|
## Column Total | 355 | 1744 | 2099 |
## -------------------------|-------------|-------------|-------------|
##
##
b <- data.frame(aaa$t.x,aaa$t.y,aaa$t.Freq,aaa$prop.row.Freq)
c <- reshape(b, idvar = "aaa.t.x",
timevar = "aaa.t.y",
direction = "wide")
p <- ggplot()+geom_bar(data=c,
aes(x=c$aaa.t.x,
y=c$`aaa.t.Freq.Charged Off`),
stat="identity",fill = "#CCCCFF", colour="black") +
labs(x = "Nos of 30 DPD in last 2 Years", y = "")
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=c,aes(c$aaa.t.x, c$`aaa.prop.row.Freq.Charged Off`*1000, group = 1, colour = "% Charge off in Bin"), size = 1 )
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin"),
values = c("% Charge off in Bin"="#ff66ff"))
## <ggproto object: Class ScaleDiscrete, Scale>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: % Charge off in Bin
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: Legend
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale>
p <- p + geom_point(data=c,aes(c$aaa.t.x,
c$`aaa.prop.row.Freq.Charged Off`*1000), colour = "#ff66ff", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./10, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
ind <- which(is.na(LoanStats2$earliest_cr_line))
LoanStats_cl <- LoanStats2[-ind,]
LoanStats_cl$earliest_cr_line_n <- as.Date(gsub(" ","",
paste("01-",
LoanStats_cl$earliest_cr_line))
,"%d-%b-%Y")
summary(LoanStats_cl$earliest_cr_line_n)
## Min. 1st Qu. Median Mean 3rd Qu.
## "1954-10-01" "1994-01-01" "1998-05-01" "1997-02-14" "2001-07-01"
## Max.
## "2008-09-01"
plot(LoanStats_cl$earliest_cr_line_n, ylab = "First credit line date")
hist(LoanStats_cl$earliest_cr_line_n, breaks = 20,
xlab = "First credit line date", col="#CCCCFF",
main="Histogram of First credit line date Distribution")
a <- quantile(as.numeric(LoanStats_cl$earliest_cr_line_n),1)
b <- quantile(as.numeric(LoanStats_cl$earliest_cr_line_n),0)
ind_la <- which(as.numeric(LoanStats_cl$earliest_cr_line_n) < a &
as.numeric(LoanStats_cl$earliest_cr_line_n) > b)
LoanStats_cl_a <- LoanStats_cl[ind_la,]
hist(LoanStats_cl_a$earliest_cr_line_n, breaks = 20,
xlab = "First credit line date", col="#CCCCFF",
main="Histogram of First credit line date Distribution")
summary(LoanStats_cl$earliest_cr_line_n)
## Min. 1st Qu. Median Mean 3rd Qu.
## "1954-10-01" "1994-01-01" "1998-05-01" "1997-02-14" "2001-07-01"
## Max.
## "2008-09-01"
summary(LoanStats_cl_a$earliest_cr_line_n)
## Min. 1st Qu. Median Mean 3rd Qu.
## "1954-11-01" "1994-01-01" "1998-05-01" "1997-02-14" "2001-07-01"
## Max.
## "2008-08-01"
LoanStats_cl$as_of_date <- format(Sys.Date(), format="%Y-%m-%d")
#LoanStats_cl$No_days_e_cr_lines <- LoanStats_cl$as_of_date - LoanStats_cl$earliest_cr_line_n
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_cl_a)/no_bins
no_cases_bin
LoanStats_cl_a$tot_loan_amt <- as.numeric(sum(LoanStats_cl_a[which(LoanStats_cl_a$loan_status_new == 'Charged Off'),]$loan_amnt))
LoanStats_cl_a$tot_loan_cnt <- as.numeric(nrow(LoanStats_cl_a[which(LoanStats_cl_a$loan_status_new == 'Charged Off'),]))
LoanStats_cl_a <- LoanStats_cl_a[order(LoanStats_cl_a$earliest_cr_line_n),]
LoanStats_cl_a$earliest_cr_line_n_bin <- cut(LoanStats_cl_a$earliest_cr_line_n,no_bins)
earliest_cr_line_n_bin <- sqldf('select earliest_cr_line_n_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_cl_a group by earliest_cr_line_n_bin, loan_status_new')
earliest_cr_line_n_bin_tran <- reshape(earliest_cr_line_n_bin, idvar = "earliest_cr_line_n_bin",
timevar = "loan_status_new",
direction = "wide")
earliest_cr_line_n_bin_tran <-data.frame(earliest_cr_line_n_bin=earliest_cr_line_n_bin_tran$earliest_cr_line_n_bin,
charge_off_Cnts=earliest_cr_line_n_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=earliest_cr_line_n_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=earliest_cr_line_n_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=earliest_cr_line_n_bin_tran$`exposure.Fully Paid`,
annual_inc_total=earliest_cr_line_n_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=earliest_cr_line_n_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=earliest_cr_line_n_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(earliest_cr_line_n_bin_tran$`cnts.Charged Off`/(earliest_cr_line_n_bin_tran$`cnts.Charged Off`+
earliest_cr_line_n_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(earliest_cr_line_n_bin_tran$`exposure.Charged Off`/(earliest_cr_line_n_bin_tran$`exposure.Charged Off`+
earliest_cr_line_n_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(earliest_cr_line_n_bin_tran$`cnts.Charged Off`/earliest_cr_line_n_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(earliest_cr_line_n_bin_tran$`exposure.Charged Off`/earliest_cr_line_n_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- earliest_cr_line_n_bin_tran[,c("earliest_cr_line_n_bin","pct_cnts_grp")]
df2 <- earliest_cr_line_n_bin_tran[,c("earliest_cr_line_n_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=earliest_cr_line_n_bin_tran,
aes(x=earliest_cr_line_n_bin_tran$earliest_cr_line_n_bin,
y=earliest_cr_line_n_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "First credit line date bin", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "First credit line date bin", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$earliest_cr_line_n_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$earliest_cr_line_n_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(10)
## [[1]]
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
##
## [[2]]
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).
ind <- which(is.na(LoanStats2$last_pymnt_d))
LoanStats_cl <- LoanStats2[-ind,]
LoanStats_cl$last_pymnt_d_n <- as.Date(gsub(" ","",
paste("01-",
LoanStats_cl$last_pymnt_d))
,"%d-%b-%Y")
summary(LoanStats_cl$last_pymnt_d_n)
## Min. 1st Qu. Median Mean 3rd Qu.
## "2008-01-01" "2012-03-01" "2013-04-01" "2013-04-02" "2014-05-01"
## Max.
## "2017-04-01"
plot(LoanStats_cl$last_pymnt_d_n, ylab = "Last Payment date")
hist(LoanStats_cl$last_pymnt_d_n, breaks = 20,
xlab = "Last Payment date", col="#CCCCFF",
main="Histogram of Last Payment date Distribution")
a <- quantile(as.numeric(LoanStats_cl$last_pymnt_d_n),1)
b <- quantile(as.numeric(LoanStats_cl$last_pymnt_d_n),0)
ind_la <- which(as.numeric(LoanStats_cl$last_pymnt_d_n) < a &
as.numeric(LoanStats_cl$last_pymnt_d_n) > b)
LoanStats_cl_a <- LoanStats_cl[ind_la,]
hist(LoanStats_cl_a$last_pymnt_d_n, breaks = 20,
xlab = "Last Payment date", col="#CCCCFF",
main="Histogram of Last Payment date Distribution")
summary(LoanStats_cl$last_pymnt_d_n)
## Min. 1st Qu. Median Mean 3rd Qu.
## "2008-01-01" "2012-03-01" "2013-04-01" "2013-04-02" "2014-05-01"
## Max.
## "2017-04-01"
summary(LoanStats_cl_a$last_pymnt_d_n)
## Min. 1st Qu. Median Mean 3rd Qu.
## "2008-02-01" "2012-03-01" "2013-04-01" "2013-04-02" "2014-05-01"
## Max.
## "2017-02-01"
LoanStats_cl$as_of_date <- format(Sys.Date(), format="%Y-%m-%d")
#LoanStats_cl$No_days_e_cr_lines <- as.numeric(LoanStats_cl$as_of_date) - as.numeric(LoanStats_cl$last_pymnt_d_n)
bin_plot <- function(x) {
no_bins <- x
library(Hmisc)
no_cases_bin <- nrow(LoanStats_cl_a)/no_bins
no_cases_bin
LoanStats_cl_a$tot_loan_amt <- as.numeric(sum(LoanStats_cl_a[which(LoanStats_cl_a$loan_status_new == 'Charged Off'),]$loan_amnt))
LoanStats_cl_a$tot_loan_cnt <- as.numeric(nrow(LoanStats_cl_a[which(LoanStats_cl_a$loan_status_new == 'Charged Off'),]))
LoanStats_cl_a <- LoanStats_cl_a[order(LoanStats_cl_a$last_pymnt_d_n),]
LoanStats_cl_a$last_pymnt_d_n_bin <- cut(LoanStats_cl_a$last_pymnt_d_n,no_bins)
last_pymnt_d_n_bin <- sqldf('select last_pymnt_d_n_bin, loan_status_new, count(*) as
cnts, sum(loan_amnt) as exposure,
tot_loan_amt, tot_loan_cnt,
sum(annual_inc) as annual_inc_total,
(sum(loan_amnt)/tot_loan_amt)*100 as pct_loan
from LoanStats_cl_a group by last_pymnt_d_n_bin, loan_status_new')
last_pymnt_d_n_bin_tran <- reshape(last_pymnt_d_n_bin, idvar = "last_pymnt_d_n_bin",
timevar = "loan_status_new",
direction = "wide")
last_pymnt_d_n_bin_tran <-data.frame(last_pymnt_d_n_bin=last_pymnt_d_n_bin_tran$last_pymnt_d_n_bin,
charge_off_Cnts=last_pymnt_d_n_bin_tran$`cnts.Charged Off`,
Fully_paid_Cnts=last_pymnt_d_n_bin_tran$`cnts.Fully Paid`,
charge_off_Exp=last_pymnt_d_n_bin_tran$`exposure.Charged Off`,
Fully_paid_Exp=last_pymnt_d_n_bin_tran$`exposure.Fully Paid`,
annual_inc_total=last_pymnt_d_n_bin_tran$`annual_inc_total.Charged Off`,
tot_loan_amt=last_pymnt_d_n_bin_tran$`tot_loan_amt.Charged Off`,
tot_loan_cnt=last_pymnt_d_n_bin_tran$`tot_loan_cnt.Charged Off`,
pct_cnts_grp=(last_pymnt_d_n_bin_tran$`cnts.Charged Off`/(last_pymnt_d_n_bin_tran$`cnts.Charged Off`+
last_pymnt_d_n_bin_tran$`cnts.Fully Paid`))*100,
pct_exp_grp=(last_pymnt_d_n_bin_tran$`exposure.Charged Off`/(last_pymnt_d_n_bin_tran$`exposure.Charged Off`+
last_pymnt_d_n_bin_tran$`exposure.Fully Paid`))*100,
pct_cnts_tot=(last_pymnt_d_n_bin_tran$`cnts.Charged Off`/last_pymnt_d_n_bin_tran$`tot_loan_cnt.Charged Off`)*100,
pct_exp_tot=(last_pymnt_d_n_bin_tran$`exposure.Charged Off`/last_pymnt_d_n_bin_tran$`tot_loan_amt.Charged Off`)*100)
df1 <- last_pymnt_d_n_bin_tran[,c("last_pymnt_d_n_bin","pct_cnts_grp")]
df2 <- last_pymnt_d_n_bin_tran[,c("last_pymnt_d_n_bin","pct_cnts_tot")]
p <- ggplot()+geom_bar(data=last_pymnt_d_n_bin_tran,
aes(x=last_pymnt_d_n_bin_tran$last_pymnt_d_n_bin,
y=last_pymnt_d_n_bin_tran$charge_off_Cnts),
stat="identity",fill = "#CCCCFF", colour="black") + labs(x = "Last Payment date bin", y = "")
p <- p + theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p <- p + ggtitle("Number of Charge offs Percent of Charge offs\n")+
theme(plot.title = element_text(lineheight=.8, face="bold",size = 8))
p <- p +
geom_line(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
geom_line(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))
p <- p + geom_point(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)
p <- p + geom_point(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)
p <- p +scale_y_continuous(sec.axis = sec_axis(~./5, name = derive())) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_line(color = "gray50", size = 0.5),
panel.grid.major.x = element_blank())
p
p3 <- ggplot() +
geom_line(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5, group = 1, colour = "% Charge off in Bin"), size = 1 )+
#geom_line(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5, group = 1, colour = "% Charge off by total Charge off"),size = 1) +
scale_colour_manual(name="Legend",breaks = c("% Charge off in Bin","% Charge off by total Charge off"),
values = c("% Charge off in Bin"="#ff66ff","% Charge off by total Charge off"="#43f906"))+
labs(x = "Last Payment date bin", y = "Percent of Charge offs")
theme(axis.text.x=element_text(angle=45,hjust=1,vjust=01,size = 10, face="bold"),
axis.text.y=element_text(size = 10, face="bold"))
p3 <- p3 + geom_point(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5), colour = "#ff66ff", size =2.5, shape = 17)+
geom_text(data=df1,aes(df1$last_pymnt_d_n_bin, df1$pct_cnts_grp*5,label=round(df1$pct_cnts_grp)),hjust=0, vjust=0)
#p3 <- p3 + geom_point(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5), colour = "#43f906", size =2.5, shape = 17)+
# geom_text(data=df2,aes(df2$last_pymnt_d_n_bin, df2$pct_cnts_tot*5,label=round(df2$pct_cnts_tot)),hjust=0, vjust=0)
p3 <- p3 + theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank())
retList <- list(p,p3)
return(retList)
}
bin_plot(12)
## [[1]]
##
## [[2]]